TANDEM Analysis

Author

Julia Brake & Nick Sumpter

Published

November 29, 2024

Background

This project involves the use of the Nightingale metabolomic panel and Olink inflammation panel to analyze metabolomic/proteomic differences between those with TB, those with DM, and those with both TB and DM. Some of the TB and TB-DM samples were also measured at baseline and after 2 months of treatment for TB.

Code
# Loading packages.
library(psych)
library(DESeq2)
library(ggrepel)
library(RColorBrewer)
library(pheatmap)
library(limma)
library(volcano3D)
#library(volcano3Ddata)
library(readxl)
library(EnvStats)
library(ggcorrplot)
library(knitr)
library(factoextra)
library(OlinkAnalyze)
library(table1)
library(tidygraph)
library(igraph)
library(ggraph)
library(foreign)
library(haven)
library(iglu)
library(openxlsx)
#library(CorLevelPlot)
library(flashClust)
library(cowplot)
library(impute)
library(WGCNA)
library(reshape2)
library(fs)
library(tidyverse)

# Setting path variable to location of this analysis.
path1 <- "/Users/julia/Library/CloudStorage/OneDrive-Radboudumc/Data_analyses/Julia_TANDEM/"

# Setting working directory to path variable.
setwd(path1)

knitr::opts_chunk$set(message = F, warning = F)

Metadata

  • The initial list of 352 samples that was provided to for Olink measurements was the basis for merging to other data (template Samplemanifest_TANDEM_JB_T96_x plt.xlsx)
  • We also used a separate plate layout document to obtain the Sample IDs that were provided to Nightingale (Plate layout TANDEM Olink JB.xlsx)
  • We obtained the corresponding Patient IDs and timepoints from the PLASMA sheet of DATA SAMPEL TANDEM 2021.xlsx
  • We obtained the phenotype and HbA1c from a file provided by Ajie (211209_TANDEM_sample_list)
Code
# Reading in olink plate layout document from H drive for all 352 samples.
olink_layout <- read_excel(path(path1, "data/template Samplemanifest_TANDEM_JB_T96_x plt.xlsx"), col_types = "text") |> 
  select(SampleID, PlateID, WellID, PatientID = Subject_ID) |> 
  filter(!str_detect(WellID, "12"))

# Reading in plate layout document final sample list sheet.
final_samplelist <- read_excel(path(path1, "data/Plate layout TANDEM Olink JB.xlsx"), sheet = 3, col_types = "text") |> 
  select(SampleID = IdPlasma,
         PatientID = study_id,
         PlateID = `Location Olink plates`,
         PlateNumID = `Olink plate number`,
         ShipmentID = `IdSample shipment`)

# Confirming that the SampleID column is identical in both files (the missing value in final_samplelist became a 0 in olink file).
# tibble(ID1 = olink_layout$SampleID, ID2 = final_samplelist$SampleID) |>
#   filter(ID1 != ID2 | is.na(ID1) | is.na(ID2))

# Confirming that the PatientID columns are identical in both files except for missing values in final_samplelist. For missing values the olink document has the ShipmentID instead of PatientID.
# tmp <- tibble(ID1 = olink_layout$Subject_ID, ID2 = final_samplelist$PatientID) |>
#   filter(ID1 != ID2 | is.na(ID1) | is.na(ID2))
# 
# tmp2 <- olink_layout |>
#   filter(Subject_ID %in% tmp$ID1) |>
#   left_join(final_samplelist, by = "SampleID")

# Checking for duplicate sampleID - 1 sample was duplicated (ID 11101416). We can rely on the original metadata to clarify which patientID/sampleID combo was correct and exclude only the incorrect one. The tube itself had the correct SampleID on it.
# table(final_samplelist$SampleID)[which(!table(final_samplelist$SampleID) == 1)]

# Combining files.
all_samples <- olink_layout |> 
  select(-PatientID) |> 
  cbind(final_samplelist |> select(-SampleID, -PlateID)) |> 
  select(SampleID, PatientID, PlateNumID, ShipmentID, OlinkWellID = WellID, OlinkPlateID = PlateID)


# Reading in the basic metadata file that was sent with the samples from Indonesia.
original_metadata <- read_excel(path(path1, "data/DATA SAMPEL TANDEM 2021.xlsx"), sheet = "PLASMA", col_types = "text")

# Merging both files, this gives the correct patientID for each sampleID and also gives the timepoint.
all_samples_meta <- all_samples |> 
  left_join(original_metadata, by = c("SampleID" = "IdPlasma")) |>
  mutate(SampleID = case_when(SampleID == "11101416" & PatientID != IdPatient ~ "Exclude",
                              SampleID == "0" ~ "Exclude",
                              TRUE ~ SampleID),
         Timepoint = case_when(SampleID == "Exclude" ~ NA_character_,
                               Visit == "1" ~ "Baseline",
                               Visit %in% c("3", "4") ~ "Month_2"),
         PatientID = case_when(SampleID == "Exclude" ~ NA_character_,
                               TRUE ~ IdPatient),
         Location = case_when(SampleID == "Exclude" ~ NA_character_,
                              TRUE ~ Location),
         ShipmentID = case_when(SampleID == "Exclude" ~ NA_character_,
                                TRUE ~ ShipmentID)) |>
  select(SampleID:OlinkPlateID, Timepoint, Location)

# Obtaining phenotype and hba1c from one of the rds files made by Ajie.
samplelist_rds <- read_rds(path(path1, "data/211209_TANDEM_sample_list")) |> 
  filter(study_id %in% all_samples_meta$PatientID) |>
  select(PatientID = study_id, Phenotype = phenotype, HbA1c = lab_hba1c_result) |> 
  mutate(Phenotype = case_when(Phenotype == "TB-diabetes" ~ "TB-DM",
                               Phenotype == "TB-no-diabetes" ~ "TB",
                               Phenotype == "no-TB-diabetes" ~ "DM"),
         HbA1c_categories = case_when(HbA1c < 5.7 ~ "Normal (<5.7%)",
                                      HbA1c >= 5.7 & HbA1c < 6.4 ~ "Elevated (5.7%-6.4%)",
                                      HbA1c >= 6.5 & HbA1c < 10 ~ "Diabetes (>6.5%-9.9%)",
                                      HbA1c >= 10               ~ "Diabetes (>10%)"))

# Obtaining age, sex, bmi, medication, diabetes duration, and TB culture results from the other rds file made by Ajie.
# Extracting the columns of interest.
metadata_rds <- read_rds(path(path1, "data/211209_TANDEM_ComprehensiveMetadata")) |>
  filter(study_id %in% all_samples_meta$PatientID) |> 
  select(PatientID = study_id, Age = age_b, Sex = sex, Weight = bmi_weight, Height = bmi_height, redcap_event_name, Insulin1 = dm_drugs___1, Metformin1 = dm_drugs___2, Sulphonurea = dm_drugs___3, Insulin2 = dm_meds___1, Metformin2 = dm_meds___2, Statin = dm_meds___5, Steroids1 = started_steroids, Steroids2 = steroids_1, DM_duration = dm_when, cul_res_o, cul_res_o2, cul_res_o_3)

# Cleaning up drug information.
drugs <- metadata_rds |> 
  group_by(PatientID) |> 
  summarize(Insulin = sum(Insulin1 == "Checked" | Insulin2 == "Checked", na.rm = T),
            Metformin = sum(Metformin1 == "Checked" | Metformin2 == "Checked", na.rm = T),
            Sulphonurea = sum(Sulphonurea == "Checked", na.rm = T),
            Statin = sum(Statin == "Checked", na.rm = T),
            Steroids_Yes = sum(Steroids1 == "Ya" | Steroids2 == "Ya", na.rm = T),
            Steroids_No = sum(Steroids1 == "Tidak" | Steroids2 == "Tidak", na.rm = T)) |> 
  mutate(Insulin = case_when(Insulin == 0 ~ "No",
                             TRUE ~ "Yes"),
         Metformin = case_when(Metformin == 0 ~ "No",
                               TRUE ~ "Yes"),
         Sulphonurea = case_when(Sulphonurea == 0 ~ "No",
                                 TRUE ~ "Yes"),
         Statin = case_when(Statin == 0 ~ "No",
                            TRUE ~ "Yes"),
         Steroids = case_when(Steroids_Yes > 0 ~ "Yes",
                              Steroids_No > 0 ~ "No",
                              TRUE ~ "Unknown")) |> 
  select(-(Steroids_Yes:Steroids_No))

# Cleaning up TB culture information.
culture <- metadata_rds |> 
  mutate(Timepoint = case_when(is.na(redcap_event_name) | redcap_event_name == "0 month" ~ "Baseline_Culture",
                               str_detect(redcap_event_name, "^2 month") ~ "Month_2_Culture",
                               str_detect(redcap_event_name, "^6 month") ~ "Month_6_Culture",
                               str_detect(redcap_event_name, "^12 month") ~ "Month_12_Culture",
                               str_detect(redcap_event_name, "^18 month") ~ "Month_18_Culture")) |> 
  filter(!is.na(Timepoint)) |> 
  group_by(PatientID, Timepoint) |> 
  summarize(Positive = sum(cul_res_o == "Positive" | cul_res_o2 == "Positive" | cul_res_o_3 == "Positive", na.rm = T),
            Negative = sum(cul_res_o == "Negative" | cul_res_o2 == "Negative" | cul_res_o_3 == "Negative", na.rm = T)) |> 
  mutate(Culture_Result = case_when(Positive > 0 ~ "Positive",
                                    Negative > 0 ~ "Negative",
                                    TRUE ~ "Unknown")) |> 
  select(-Positive, -Negative) |> 
  pivot_wider(names_from = Timepoint, values_from = Culture_Result)

# Keeping all other columns and identifying duplicated samples.
metadata_rds2 <- metadata_rds |> 
  filter(is.na(redcap_event_name) | str_detect(redcap_event_name, "^0")) |> 
  select(PatientID:redcap_event_name, DM_duration)

dup_meta_rds <- metadata_rds2 |> 
  filter(duplicated(PatientID) | duplicated(PatientID, fromLast = T),
         redcap_event_name == "0 month")

# Excluding duplicated samples (preferentially keeping data from TB database), merging drug/culture information, then cleaning up the columns and calculating BMI.
metadata_rds_dedup <- metadata_rds2 |> 
  filter(!PatientID %in% dup_meta_rds$PatientID) |> 
  rbind(dup_meta_rds) |> 
  select(-redcap_event_name) |> 
  left_join(drugs, by = "PatientID") |> 
  left_join(culture, by = "PatientID") |> 
  mutate(DM_duration_years = case_when(DM_duration == "< 12 bulan" ~ "< 1",
                                       DM_duration == "1-5 tahun" ~ "1 - 5",
                                       DM_duration == "6-15 tahun" ~ "6 - 15",
                                       DM_duration == "15+ tahun" ~ "> 15"),
         Age = round(Age, 1),
         Weight = round(Weight, 1),
         BMI = Weight / ((Height / 100) ^ 2),
         BMI_categories = case_when(BMI < 18.5 ~ "Underweight (<18.5)",
                                    BMI >= 18.5 & BMI < 23.0 ~ "Normal (18.5-22.9)",
                                    BMI >= 23.0 & BMI < 25.0 ~ "Overweight (23.0-24.9)",
                                    BMI >= 25 ~ "Obese (>=25)")) |> 
  select(-DM_duration)

# Obtaining smoking status, waist-to-hip ratio, haemoglobin, IGRA, and comorbidity data from DM spreadsheet.
dm_excel <- read_excel(path(path1, "data/Sample ID and DM patients data merged.xlsx"), col_types = "text") |>
  filter(study_id %in% all_samples_meta$PatientID) |> 
  select(PatientID = study_id, Smoker = smokes, Waist = waist, Hip = hip, Haemoglobin = hb_result, IGRA = qft_result, IGRA2 = igra_status, dcangina, dcstroke, dcheartattack, dcheartbypass, eyecataract, eyeglaucoma, eyeblind, eyesight, Kidney_Failure = renal, Foot_amputation = dclost_limb) |> 
  mutate(Smoker = case_when(Smoker == "Tidak sama sekali" ~ "No",
                            Smoker == "Setiap hari" ~ "Yes",
                            Smoker == "Kurang dari setiap hari" ~ "Yes"),
         Waist = round(as.numeric(Waist), 1),
         Hip = round(as.numeric(Hip), 1),
         Haemoglobin = round(as.numeric(Haemoglobin), 1),
         Waist_to_Hip_Ratio = Waist / Hip,
         Angina = case_when(dcangina == "tidak" ~ "No",
                            dcangina == "ya" ~ "Yes",
                            dcangina == "tidak tahu" ~ "Unknown"),
         Stroke = case_when(dcstroke == "tidak" ~ "No",
                            dcstroke == "ya" ~ "Yes",
                            dcstroke == "tidak tahu" ~ "Unknown"),
         Heart_Attack = case_when(dcheartattack == "tidak" ~ "No",
                                  dcheartattack == "ya" ~ "Yes",
                                  dcheartattack == "tidak tahu" ~ "Unknown"),
         Heart_Bypass = case_when(dcheartbypass == "tidak" ~ "No",
                                  dcheartbypass == "ya" ~ "Yes",
                                  dcheartbypass == "tidak tahu" ~ "Unknown"),
         Eye_Cataract = case_when(eyecataract == "tidak" ~ "No",
                                  eyecataract == "ya" ~ "Yes",
                                  eyecataract == "tidak tahu" ~ "Unknown"),
         Eye_Glaucoma = case_when(eyeglaucoma == "tidak" ~ "No",
                                  eyeglaucoma == "ya" ~ "Yes",
                                  eyeglaucoma == "tidak tahu" ~ "Unknown"),
         Eye_Blind = case_when(eyeblind == "tidak" ~ "No",
                               eyeblind == "ya" ~ "Yes",
                               eyeblind == "tidak tahu" ~ "Unknown"),
         Eye_Sight = case_when(eyesight == "tidak" ~ "No",
                               eyesight == "ya" ~ "Yes",
                               eyesight == "tidak tahu" ~ "Unknown"),
         Kidney_Failure = case_when(Kidney_Failure == "tidak" ~ "No",
                                    Kidney_Failure == "ya" ~ "Yes",
                                    Kidney_Failure == "tidak tahu" ~ "Unknown"),
         Foot_amputation = case_when(Foot_amputation == "tidak" ~ "No",
                                     Foot_amputation == "ya" ~ "Yes")) |> 
  select(-(IGRA2:eyesight))

# Checking which IGRA column to keep above (IGRA column had all information as in IGRA2).
# tmp <- dm_excel |> 
#   filter(IGRA != IGRA2 | is.na(IGRA) | is.na(IGRA2))

# Obtaining smoking status, waist-to-hip ratio, haemoglobin, and eGFR from TB spreadsheet.
tb_excel <- read_excel(path(path1, "data/Sample ID and TB patients data merged.xlsx"), col_types = "text") |>
  filter(study_id %in% all_samples_meta$PatientID) |> 
  select(PatientID = study_id, Smoker = smokes, Waist = waist, Hip = hip, Haemoglobin = hb_result, eGFR = calc_egfr) |> 
  mutate(Smoker = case_when(Smoker == "Tidak sama sekali" ~ "No",
                            Smoker == "Setiap hari" ~ "Yes",
                            Smoker == "Kurang dari setiap hari" ~ "Yes"),
         Waist = round(as.numeric(Waist), 1),
         Hip = round(as.numeric(Hip), 1),
         Haemoglobin = round(as.numeric(Haemoglobin), 1),
         Waist_to_Hip_Ratio = Waist / Hip)

# Obtaining MTB strain and drug resistance information from WGS file
wgs_data <- read_excel(path(path1, "data/WGS_data_from Carolien.xlsx"), sheet = "Sheet1", col_types = "text") |> 
  mutate(Resistance = case_when(rifampicin != "-" & isoniazid != "-" ~ "Both",
                                rifampicin != "-" ~ "Rifampicin",
                                isoniazid != "-" ~ "Isoniazid",
                                TRUE ~ "None")) |> 
  select(PatientID = stnum, MTB_strain = name, Resistance) |> 
  filter(PatientID %in% all_samples_meta$PatientID)

# Obtaining CXR score and death data from Alit's metadata.
alit_table <- read_xls(path(path1, "data/Tandem Olink Metadata_Timika_Outcomes.xls"), col_types = "text") |> 
  select(PatientID = study_id, Timika_score = timika_score, Withdraw_reason = withdraw_reason) |> 
  mutate(Timika_score = as.numeric(Timika_score),
         Death = case_when(Withdraw_reason == "Death" ~ "Yes",
                           TRUE ~ "Unknown")) |> 
  select(-Withdraw_reason)

# Combining all information into a final metadata table and cleaning up columns.
all_samples_meta_final <- all_samples_meta |> 
  left_join(samplelist_rds, by = "PatientID") |> 
  left_join(metadata_rds_dedup, by = "PatientID") |> 
  left_join(alit_table, by = "PatientID") |> 
  left_join(wgs_data, by = "PatientID") |> 
  left_join(tb_excel, by = "PatientID") |> 
  left_join(dm_excel, by = "PatientID") |> 
  mutate(Smoker = case_when(!is.na(Smoker.x) ~ Smoker.x,
                            TRUE ~ Smoker.y),
         Waist = case_when(!is.na(Waist.x) ~ Waist.x,
                           TRUE ~ Waist.y),
         Hip = case_when(!is.na(Hip.x) ~ Hip.x,
                         TRUE ~ Hip.y),
         Haemoglobin = case_when(!is.na(Haemoglobin.x) ~ Haemoglobin.x,
                                 TRUE ~ Haemoglobin.y),
         Waist_to_Hip_Ratio = case_when(!is.na(Waist_to_Hip_Ratio.x) ~ Waist_to_Hip_Ratio.x,
                                        TRUE ~ Waist_to_Hip_Ratio.y),
         Treatment_Failure = case_when(Death == "Yes" | Month_6_Culture == "Positive" | Month_12_Culture == "Positive" | Month_18_Culture == "Positive" ~ "Yes",
                                       TRUE ~ "No")) |> 
  select(-Smoker.x, -Waist.x, -Hip.x, -Haemoglobin.x, -Waist_to_Hip_Ratio.x, -Smoker.y, -Waist.y, -Hip.y, -Haemoglobin.y, -Waist_to_Hip_Ratio.y, -Location)

# Validating TB phenotype (note that 11 patients had negative baseline cultures, but each appears to have a positive smear result or CXR score). Added a column for which samples this was.
tmp <- all_samples_meta_final |> 
  filter(Phenotype %in% c("TB", "TB-DM"),
         Baseline_Culture == "Negative")

# tmp2 <- read_rds(path("data/211209_TANDEM_ComprehensiveMetadata")) |>
#   filter(study_id %in% tmp$PatientID) |>
#   select(study_id, redcap_event_name, starts_with("smear_res"), starts_with("cul_res_o"))

all_samples_meta_final <- all_samples_meta_final |> 
  mutate(Double_Check_TB = case_when(PatientID %in% tmp$PatientID ~ "Check",
                                     TRUE ~ "Culture_Positive"))

# Validating DM phenotype.
# tmp <- all_samples_meta_final |> 
#   filter(Phenotype %in% c("DM", "TB-DM"),
#          HbA1c_categories != "Diabetes (>6.5%)")
# 
# tmp <- all_samples_meta_final |> 
#   filter(Phenotype %in% c("TB"),
#          HbA1c_categories == "Diabetes (>6.5%)")

# Validating Treatment_Failure column.
# tmp <- all_samples_meta_final |> 
#   filter(Treatment_Failure == "Yes")

# Filtering out missing values for timepoint. 
has_na <- is.na(all_samples_meta_final$Timepoint)
all_samples_meta_new <- all_samples_meta_final[!has_na, ]
rownames(all_samples_meta_new) <- NULL

# Cleaning up R environment.
rm(alit_table, all_samples, all_samples_meta, culture, dm_excel, drugs, dup_meta_rds, final_samplelist, metadata_rds, metadata_rds_dedup, metadata_rds2, olink_layout, original_metadata, samplelist_rds, tb_excel, tmp, wgs_data)

> Nightingale

The same 352 samples were sent to Nightingale for metabolomic analysis. Nightingale provides these results in a single Excel file which also has all QC related information.

Quality control

Nightingale performs a large amount of QC prior to sending the results. For sample QC, they don’t provide measurements for any sample that they excluded, such as those with solid material in the tube.

Samples which failed the Nightingale QC and biomarkers with > 25% below LOD were excluded. This excludes a total of 21 samples and 2 biomarkers.

Code
# Reading in raw Nightingale results file and adding metadata.
nightingale <- read_excel(path(path1, "data/Nightingale_Results_uncleaned.xlsx"), sheet = 1, skip = 10) |> 
  select(-(`...1`:`...2`)) |> 
  slice(-(1:4)) |> 
  rename(Sample = `Sample id`) |> 
  pivot_longer(!Sample, names_to = "Biomarker", values_to = "Concentration") |> 
  left_join(all_samples_meta_final, by = c("Sample" = "PlateNumID")) |> 
  filter(SampleID != "Exclude")

# Reading in sample QC sheet and adding to master file. In total 21 samples were flagged for exclusion by Nightingale. Resulting N = 331.
sample_qc <- read_excel(path(path1, "data/Nightingale_Results_uncleaned.xlsx"), sheet = 2, skip = 11) |> 
  select(-`Sample has remarks`) |> 
  slice(-(1:3)) |> 
  rename(Sample = `...2`) |> 
  filter(`Sample excluded` == "x")

nightingale_qc <- nightingale |> 
  left_join(sample_qc, by = "Sample") |> 
  filter(is.na(`Sample excluded`))

# Reading in biomarker QC sheet and adding to master file.
biomarker_qc <- read_excel(path(path1, "data/Nightingale_Results_uncleaned.xlsx"), sheet = 3, skip = 10) |> 
  select(-`...1`) |> 
  slice(-(1:4)) |> 
  rename(Sample = `Sample id`) |> 
  pivot_longer(cols = `Total-C`:`S-HDL-TG %`, names_to = "Biomarker", values_to = "QC") |> 
  filter(!is.na(QC))

nightingale_qc2 <- nightingale_qc |> 
  left_join(biomarker_qc, by = c("Sample", "Biomarker"))

#table(nightingale_qc2$QC)

# Calculating percentage below LOD per assay for each phenotype/timepoint.
below_LOD_data <- nightingale_qc2 |> 
  mutate(below_LOD = QC == "Below limit of quantification") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(below_LOD_percent = sum(below_LOD, na.rm = T) / n() * 100)

# Identifying all Assays with at least one group over 25% below LOD.
over_25 <- below_LOD_data |> 
  group_by(Biomarker) |> 
  summarize(N_over25 = sum(below_LOD_percent > 25)) |> 
  filter(N_over25 > 0)

# Plotting LOD data.
below_LOD_data |> 
  filter(Biomarker %in% over_25$Biomarker) |> 
  mutate(Biomarker = factor(Biomarker),
         Biomarker = reorder(Biomarker, below_LOD_percent, decreasing = T)) |> 
  ggplot(aes(x = Biomarker, y = below_LOD_percent, color = Phenotype, shape = Timepoint)) +
  geom_hline(yintercept = 25, color = "darkred", linetype = 2) +
  geom_point(size = 2) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 100)) +
  scale_color_manual(values = c("goldenrod3", "blue4", "deeppink3")) +
  labs(y = "Below LOD (%)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title.x = element_blank(),
        panel.grid.minor.y = element_blank())

Code
# Excluding assays with more than 25% of samples below LOD in all 5 phenotype groups.
nightingale_qc_final <- nightingale_qc2 |> 
  filter(!Biomarker %in% (over_25 |> filter(N_over25 == 5) |> pull(Biomarker)))

# Cleaning up R environment.
rm(below_LOD_data, sample_qc, biomarker_qc, nightingale, nightingale_qc, nightingale_qc2, over_25)

# Converting Concentration column to numeric and checking which remaining values become NA. Only 27 measurements are NA and all had a QC warning of some type. This is fine.
tmp <- nightingale_qc_final |> 
  mutate(numeric_conc = as.numeric(Concentration)) |> 
  filter(is.na(numeric_conc)) |> 
  select(Sample:Concentration, QC)

nightingale_qc_final <- nightingale_qc_final |> 
  mutate(Concentration = as.numeric(Concentration)) |> 
  filter(!is.na(Concentration))

Subsetting measurements into categories

Code
# Specifying subgroups of measurements for sub analyses.
all_biomarkers <- unique(nightingale_qc_final$Biomarker)
lipoproteins <- all_biomarkers[c(1:34, 40:41)]
other_lipids <- all_biomarkers[c(35, 37:39)]
fatty_acids <- all_biomarkers[43:51]
ratios <- all_biomarkers[c(36, 42, 52:60)]
amino_acids <- all_biomarkers[61:70]
glycolysis <- all_biomarkers[71:74]
ketones <- all_biomarkers[75:78]
other <- all_biomarkers[79:81]
lipoprotein_subclasses <- all_biomarkers[82:247]
Julias_selection <- all_biomarkers[c(1:31, 40:41, 71, 81)] 
Small_selection_lipids <- all_biomarkers[c(4, 6:7, 9, 40:41, 81)]
Small_selection_lipids2 <- all_biomarkers[c(7, 9, 40:41)]

PCA

All measurements at baseline

Code
# Baseline differences.
tmp <- nightingale_qc_final |> 
  select(SampleID, Timepoint, Phenotype, Biomarker, Concentration) |> 
  pivot_wider(names_from = Biomarker, values_from = Concentration) |> 
  drop_na() |> 
  mutate(across(!(SampleID:Phenotype), function(x) (x - mean(x, na.rm = T)) / sd(x, na.rm = T)),
         Timepoint = factor(Timepoint),
         Phenotype = factor(Phenotype)) |> 
  filter(Timepoint == "Baseline")

pca_res <- prcomp(tmp |> column_to_rownames("SampleID") |> select(!(Timepoint:Phenotype)))

pca_trans <- cbind(as.data.frame(pca_res$x), tmp |> select(SampleID:Phenotype))

percent <- round(as.numeric(summary(pca_res)$importance[2,]) * 100, 1)

pca_trans |> 
  ggplot(aes(x = PC1, y = PC2, color = Phenotype)) +
  geom_point(size = 3) +
  stat_ellipse() +
  labs(title = "Nightingale measurements at baseline",
       x = paste0("PC1 [", percent[1], "%]"),
       y = paste0("PC2 [", percent[2], "%]")) +
  scale_color_manual(values = c("goldenrod3", "blue4", "deeppink3")) +
  theme_bw() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

All measurements at baseline (excluding subclasses)

Code
# Baseline differences.
tmp <- nightingale_qc_final |> 
  filter(!Biomarker %in% lipoprotein_subclasses) |> 
  select(SampleID, Timepoint, Phenotype, Biomarker, Concentration) |> 
  pivot_wider(names_from = Biomarker, values_from = Concentration) |> 
  drop_na() |> 
  mutate(across(!(SampleID:Phenotype), function(x) (x - mean(x, na.rm = T)) / sd(x, na.rm = T)),
         Timepoint = factor(Timepoint),
         Phenotype = factor(Phenotype)) |> 
  filter(Timepoint == "Baseline")

pca_res <- prcomp(tmp |> column_to_rownames("SampleID") |> select(!(Timepoint:Phenotype)))

pca_trans <- cbind(as.data.frame(pca_res$x), tmp |> select(SampleID:Phenotype))

percent <- round(as.numeric(summary(pca_res)$importance[2,]) * 100, 1)

pca_trans |> 
  ggplot(aes(x = PC1, y = PC2, color = Phenotype)) +
  geom_point(size = 3) +
  stat_ellipse() +
  labs(title = "Nightingale measurements at baseline",
       x = paste0("PC1 [", percent[1], "%]"),
       y = paste0("PC2 [", percent[2], "%]")) +
  scale_color_manual(values = c("goldenrod3", "blue4", "deeppink3")) +
  theme_bw() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

Julia’s selection at baseline

Code
# Baseline differences.
tmp <- nightingale_qc_final |> 
  filter(Biomarker %in% Julias_selection) |> 
  select(SampleID, Timepoint, Phenotype, Biomarker, Concentration) |> 
  pivot_wider(names_from = Biomarker, values_from = Concentration) |> 
  drop_na() |> 
  mutate(across(!(SampleID:Phenotype), function(x) (x - mean(x, na.rm = T)) / sd(x, na.rm = T)),
         Timepoint = factor(Timepoint),
         Phenotype = factor(Phenotype)) |> 
  filter(Timepoint == "Baseline")

pca_res <- prcomp(tmp |> column_to_rownames("SampleID") |> select(!(Timepoint:Phenotype)))

pca_trans <- cbind(as.data.frame(pca_res$x), tmp |> select(SampleID:Phenotype))

percent <- round(as.numeric(summary(pca_res)$importance[2,]) * 100, 1)

pca_trans |> 
  ggplot(aes(x = PC1, y = PC2, color = Phenotype)) +
  geom_point(size = 3, alpha = 0.8) +
  stat_ellipse() +
  labs(title = "Nightingale measurements at baseline",
       x = paste0("PC1 [", percent[1], "%]"),
       y = paste0("PC2 [", percent[2], "%]")) +
  scale_color_manual(values = c("goldenrod3", "blue4", "deeppink3")) +
  theme_bw() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

Code
ggsave(filename = path(path1, "output/Nightingale_PCA_Julias_selection.svg"), height = 4, width = 5, units = "in")

All measurements at month 2

Code
# PCA of the entire metabolome.
tmp <- nightingale_qc_final |> 
  select(SampleID, Timepoint, Phenotype, Biomarker, Concentration) |> 
  pivot_wider(names_from = Biomarker, values_from = Concentration) |> 
  drop_na() |> 
  mutate(across(!(SampleID:Phenotype), function(x) (x - mean(x, na.rm = T)) / sd(x, na.rm = T)),
         Timepoint = factor(Timepoint),
         Phenotype = factor(Phenotype)) |> 
  filter(Timepoint == "Month_2")

pca_res <- prcomp(tmp |> column_to_rownames("SampleID") |> select(!(Timepoint:Phenotype)))

pca_trans <- cbind(as.data.frame(pca_res$x), tmp |> select(SampleID:Phenotype))

percent <- round(as.numeric(summary(pca_res)$importance[2,]) * 100, 1)

pca_trans |> 
  ggplot(aes(x = PC1, y = PC2, color = Phenotype)) +
  geom_point(size = 3) +
  stat_ellipse() +
  labs(title = "Nightingale measurements at Month 2",
       x = paste0("PC1 [", percent[1], "%]"),
       y = paste0("PC2 [", percent[2], "%]")) +
  scale_color_manual(values = c("blue4", "deeppink3")) +
  theme_bw() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

All measurements at month 2 (excluding subclasses)

Code
# Baseline differences.
tmp <- nightingale_qc_final |> 
  filter(!Biomarker %in% lipoprotein_subclasses) |> 
  select(SampleID, Timepoint, Phenotype, Biomarker, Concentration) |> 
  pivot_wider(names_from = Biomarker, values_from = Concentration) |> 
  drop_na() |> 
  mutate(across(!(SampleID:Phenotype), function(x) (x - mean(x, na.rm = T)) / sd(x, na.rm = T)),
         Timepoint = factor(Timepoint),
         Phenotype = factor(Phenotype)) |> 
  filter(Timepoint == "Month_2")

pca_res <- prcomp(tmp |> column_to_rownames("SampleID") |> select(!(Timepoint:Phenotype)))

pca_trans <- cbind(as.data.frame(pca_res$x), tmp |> select(SampleID:Phenotype))

percent <- round(as.numeric(summary(pca_res)$importance[2,]) * 100, 1)

pca_trans |> 
  ggplot(aes(x = PC1, y = PC2, color = Phenotype)) +
  geom_point(size = 3) +
  stat_ellipse() +
  labs(title = "Nightingale measurements at month 2",
       x = paste0("PC1 [", percent[1], "%]"),
       y = paste0("PC2 [", percent[2], "%]")) +
  scale_color_manual(values = c("blue4", "deeppink3")) +
  theme_bw() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

Table metadata

Code
# create a df with unique SampleIDs for Olink
unique_patients_Nightingale <- nightingale_qc_final %>%
                          distinct(SampleID, .keep_all = TRUE)

table <- table1(~ Age + 
                  Sex +
                  HbA1c +
                  HbA1c_categories +
                  Weight + BMI + BMI_categories + Waist_to_Hip_Ratio +
                  Insulin +
                  Metformin +
                  Sulphonurea +
                  Statin +
                  Steroids +
                  Month_6_Culture + Month_12_Culture + Month_18_Culture + 
                  DM_duration_years + 
                  Timika_score +
                  Haemoglobin +
                  Death +
                  Treatment_Failure +
                  MTB_strain +
                  Resistance + 
                  Heart_Bypass +
                  Eye_Cataract +
                  Eye_Glaucoma +
                  Eye_Blind + 
                  Eye_Sight +
                  Smoker
                  | Phenotype + Timepoint, data = unique_patients_Nightingale)
table 
DM
TB
TB-DM
Overall
Baseline
(N=93)
Baseline
(N=91)
Month_2
(N=32)
Baseline
(N=83)
Month_2
(N=31)
Baseline
(N=267)
Month_2
(N=63)
Age
Mean (SD) 54.1 (11.7) 42.6 (13.4) 48.8 (8.63) 51.9 (10.3) 53.0 (8.91) 49.5 (12.9) 50.8 (8.95)
Median [Min, Max] 54.0 [25.0, 77.0] 41.7 [19.9, 84.4] 47.4 [35.3, 64.8] 50.3 [23.6, 75.1] 51.7 [33.1, 72.0] 49.2 [19.9, 84.4] 49.5 [33.1, 72.0]
Sex
Female 37 (39.8%) 42 (46.2%) 15 (46.9%) 39 (47.0%) 16 (51.6%) 118 (44.2%) 31 (49.2%)
Male 56 (60.2%) 49 (53.8%) 17 (53.1%) 44 (53.0%) 15 (48.4%) 149 (55.8%) 32 (50.8%)
HbA1c
Mean (SD) 9.40 (2.16) 5.52 (0.285) 5.50 (0.268) 11.2 (2.58) 11.5 (1.98) 8.63 (3.04) 8.46 (3.34)
Median [Min, Max] 9.20 [6.60, 15.1] 5.60 [4.70, 5.90] 5.60 [5.00, 5.90] 11.2 [6.60, 17.4] 11.4 [7.80, 15.9] 8.10 [4.70, 17.4] 5.90 [5.00, 15.9]
HbA1c_categories
Diabetes (>10%) 33 (35.5%) 0 (0%) 0 (0%) 51 (61.4%) 24 (77.4%) 84 (31.5%) 24 (38.1%)
Diabetes (>6.5%-9.9%) 60 (64.5%) 0 (0%) 0 (0%) 32 (38.6%) 7 (22.6%) 92 (34.5%) 7 (11.1%)
Elevated (5.7%-6.4%) 0 (0%) 35 (38.5%) 11 (34.4%) 0 (0%) 0 (0%) 35 (13.1%) 11 (17.5%)
Normal (<5.7%) 0 (0%) 56 (61.5%) 21 (65.6%) 0 (0%) 0 (0%) 56 (21.0%) 21 (33.3%)
Weight
Mean (SD) 63.4 (11.4) 46.4 (7.77) 48.3 (9.33) 52.5 (10.5) 52.2 (9.21) 54.2 (12.3) 50.3 (9.40)
Median [Min, Max] 62.7 [39.8, 97.0] 45.2 [31.6, 78.9] 47.2 [34.2, 78.9] 50.5 [35.8, 84.8] 52.3 [38.5, 72.6] 52.5 [31.6, 97.0] 49.2 [34.2, 78.9]
BMI
Mean (SD) 25.4 (4.02) 19.4 (3.14) 20.3 (3.86) 21.5 (3.82) 21.5 (3.87) 22.1 (4.45) 20.9 (3.87)
Median [Min, Max] 25.2 [16.6, 35.6] 18.9 [13.9, 33.3] 19.7 [15.4, 33.3] 20.8 [14.6, 32.8] 19.9 [16.1, 31.7] 21.3 [13.9, 35.6] 19.8 [15.4, 33.3]
BMI_categories
Normal (18.5-22.9) 23 (24.7%) 39 (42.9%) 15 (46.9%) 39 (47.0%) 15 (48.4%) 101 (37.8%) 30 (47.6%)
Obese (>=25) 49 (52.7%) 4 (4.4%) 4 (12.5%) 17 (20.5%) 8 (25.8%) 70 (26.2%) 12 (19.0%)
Overweight (23.0-24.9) 18 (19.4%) 9 (9.9%) 2 (6.3%) 8 (9.6%) 1 (3.2%) 35 (13.1%) 3 (4.8%)
Underweight (<18.5) 3 (3.2%) 39 (42.9%) 11 (34.4%) 19 (22.9%) 7 (22.6%) 61 (22.8%) 18 (28.6%)
Waist_to_Hip_Ratio
Mean (SD) 0.917 (0.0556) 0.814 (0.0582) 0.819 (0.0598) 0.871 (0.0670) 0.855 (0.0527) 0.868 (0.0736) 0.837 (0.0589)
Median [Min, Max] 0.921 [0.771, 1.06] 0.808 [0.684, 0.989] 0.809 [0.704, 0.922] 0.872 [0.691, 1.05] 0.857 [0.691, 0.935] 0.872 [0.684, 1.06] 0.831 [0.691, 0.935]
Insulin
No 68 (73.1%) 91 (100%) 32 (100%) 55 (66.3%) 16 (51.6%) 214 (80.1%) 48 (76.2%)
Yes 25 (26.9%) 0 (0%) 0 (0%) 28 (33.7%) 15 (48.4%) 53 (19.9%) 15 (23.8%)
Metformin
No 44 (47.3%) 91 (100%) 32 (100%) 14 (16.9%) 6 (19.4%) 149 (55.8%) 38 (60.3%)
Yes 49 (52.7%) 0 (0%) 0 (0%) 69 (83.1%) 25 (80.6%) 118 (44.2%) 25 (39.7%)
Sulphonurea
No 93 (100%) 91 (100%) 32 (100%) 63 (75.9%) 26 (83.9%) 247 (92.5%) 58 (92.1%)
Yes 0 (0%) 0 (0%) 0 (0%) 20 (24.1%) 5 (16.1%) 20 (7.5%) 5 (7.9%)
Statin
No 79 (84.9%) 91 (100%) 32 (100%) 81 (97.6%) 31 (100%) 251 (94.0%) 63 (100%)
Yes 14 (15.1%) 0 (0%) 0 (0%) 2 (2.4%) 0 (0%) 16 (6.0%) 0 (0%)
Steroids
No 89 (95.7%) 87 (95.6%) 32 (100%) 74 (89.2%) 31 (100%) 250 (93.6%) 63 (100%)
Unknown 1 (1.1%) 1 (1.1%) 0 (0%) 6 (7.2%) 0 (0%) 8 (3.0%) 0 (0%)
Yes 3 (3.2%) 3 (3.3%) 0 (0%) 3 (3.6%) 0 (0%) 9 (3.4%) 0 (0%)
Month_6_Culture
Negative 0 (0%) 80 (87.9%) 29 (90.6%) 69 (83.1%) 27 (87.1%) 149 (55.8%) 56 (88.9%)
Positive 0 (0%) 8 (8.8%) 1 (3.1%) 8 (9.6%) 3 (9.7%) 16 (6.0%) 4 (6.3%)
Unknown 0 (0%) 3 (3.3%) 2 (6.3%) 3 (3.6%) 1 (3.2%) 6 (2.2%) 3 (4.8%)
Missing 93 (100%) 0 (0%) 0 (0%) 3 (3.6%) 0 (0%) 96 (36.0%) 0 (0%)
Month_12_Culture
Negative 0 (0%) 71 (78.0%) 23 (71.9%) 65 (78.3%) 25 (80.6%) 136 (50.9%) 48 (76.2%)
Positive 0 (0%) 4 (4.4%) 4 (12.5%) 0 (0%) 0 (0%) 4 (1.5%) 4 (6.3%)
Unknown 0 (0%) 10 (11.0%) 3 (9.4%) 13 (15.7%) 5 (16.1%) 23 (8.6%) 8 (12.7%)
Missing 93 (100%) 6 (6.6%) 2 (6.3%) 5 (6.0%) 1 (3.2%) 104 (39.0%) 3 (4.8%)
Month_18_Culture
Negative 0 (0%) 64 (70.3%) 18 (56.3%) 55 (66.3%) 25 (80.6%) 119 (44.6%) 43 (68.3%)
Positive 0 (0%) 2 (2.2%) 2 (6.3%) 0 (0%) 0 (0%) 2 (0.7%) 2 (3.2%)
Unknown 0 (0%) 17 (18.7%) 10 (31.3%) 10 (12.0%) 1 (3.2%) 27 (10.1%) 11 (17.5%)
Missing 93 (100%) 8 (8.8%) 2 (6.3%) 18 (21.7%) 5 (16.1%) 119 (44.6%) 7 (11.1%)
DM_duration_years
< 1 15 (16.1%) 0 (0%) 0 (0%) 16 (19.3%) 8 (25.8%) 31 (11.6%) 8 (12.7%)
> 15 7 (7.5%) 0 (0%) 0 (0%) 2 (2.4%) 1 (3.2%) 9 (3.4%) 1 (1.6%)
1 - 5 37 (39.8%) 0 (0%) 0 (0%) 28 (33.7%) 7 (22.6%) 65 (24.3%) 7 (11.1%)
6 - 15 34 (36.6%) 0 (0%) 0 (0%) 13 (15.7%) 6 (19.4%) 47 (17.6%) 6 (9.5%)
Missing 0 (0%) 91 (100%) 32 (100%) 24 (28.9%) 9 (29.0%) 115 (43.1%) 41 (65.1%)
Timika_score
Mean (SD) NA (NA) 46.5 (30.3) 55.4 (33.5) 40.8 (28.0) 41.8 (27.9) 43.9 (29.3) 48.8 (31.4)
Median [Min, Max] NA [NA, NA] 48.3 [0, 123] 51.3 [0, 119] 30.0 [0, 128] 33.8 [0, 90.0] 37.9 [0, 128] 48.3 [0, 119]
Missing 93 (100%) 6 (6.6%) 2 (6.3%) 10 (12.0%) 3 (9.7%) 109 (40.8%) 5 (7.9%)
Haemoglobin
Mean (SD) 14.0 (1.91) 12.3 (1.67) 12.9 (1.47) 12.9 (1.80) 13.1 (1.96) 13.1 (1.92) 13.0 (1.72)
Median [Min, Max] 13.9 [7.90, 18.2] 12.6 [6.80, 15.8] 13.1 [9.40, 15.8] 13.0 [7.30, 17.2] 13.3 [9.00, 17.2] 13.1 [6.80, 18.2] 13.3 [9.00, 17.2]
Death
Unknown 93 (100%) 87 (95.6%) 31 (96.9%) 78 (94.0%) 30 (96.8%) 258 (96.6%) 61 (96.8%)
Yes 0 (0%) 4 (4.4%) 1 (3.1%) 5 (6.0%) 1 (3.2%) 9 (3.4%) 2 (3.2%)
Treatment_Failure
No 93 (100%) 75 (82.4%) 24 (75.0%) 70 (84.3%) 27 (87.1%) 238 (89.1%) 51 (81.0%)
Yes 0 (0%) 16 (17.6%) 8 (25.0%) 13 (15.7%) 4 (12.9%) 29 (10.9%) 12 (19.0%)
MTB_strain
East-Asian (Beijing) 0 (0%) 14 (15.4%) 7 (21.9%) 19 (22.9%) 9 (29.0%) 33 (12.4%) 16 (25.4%)
East-Asian (non-Beijing) 0 (0%) 3 (3.3%) 2 (6.3%) 3 (3.6%) 1 (3.2%) 6 (2.2%) 3 (4.8%)
Euro-American 0 (0%) 13 (14.3%) 7 (21.9%) 7 (8.4%) 3 (9.7%) 20 (7.5%) 10 (15.9%)
Euro-American (H37Rv-like) 0 (0%) 1 (1.1%) 0 (0%) 1 (1.2%) 0 (0%) 2 (0.7%) 0 (0%)
Euro-American (LAM) 0 (0%) 13 (14.3%) 4 (12.5%) 13 (15.7%) 4 (12.9%) 26 (9.7%) 8 (12.7%)
Euro-American (mainly T) 0 (0%) 5 (5.5%) 2 (6.3%) 1 (1.2%) 2 (6.5%) 6 (2.2%) 4 (6.3%)
Euro-American (X-type) 0 (0%) 13 (14.3%) 4 (12.5%) 15 (18.1%) 5 (16.1%) 28 (10.5%) 9 (14.3%)
Indo-Oceanic 0 (0%) 4 (4.4%) 0 (0%) 5 (6.0%) 4 (12.9%) 9 (3.4%) 4 (6.3%)
Missing 93 (100%) 25 (27.5%) 6 (18.8%) 19 (22.9%) 3 (9.7%) 137 (51.3%) 9 (14.3%)
Resistance
Both 0 (0%) 1 (1.1%) 0 (0%) 0 (0%) 1 (3.2%) 1 (0.4%) 1 (1.6%)
Isoniazid 0 (0%) 4 (4.4%) 3 (9.4%) 2 (2.4%) 1 (3.2%) 6 (2.2%) 4 (6.3%)
None 0 (0%) 62 (68.1%) 23 (71.9%) 62 (74.7%) 27 (87.1%) 124 (46.4%) 50 (79.4%)
Rifampicin 0 (0%) 0 (0%) 0 (0%) 1 (1.2%) 0 (0%) 1 (0.4%) 0 (0%)
Missing 93 (100%) 24 (26.4%) 6 (18.8%) 18 (21.7%) 2 (6.5%) 135 (50.6%) 8 (12.7%)
Heart_Bypass
No 88 (94.6%) 0 (0%) 0 (0%) 7 (8.4%) 4 (12.9%) 95 (35.6%) 4 (6.3%)
Yes 4 (4.3%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 4 (1.5%) 0 (0%)
Missing 1 (1.1%) 91 (100%) 32 (100%) 76 (91.6%) 27 (87.1%) 168 (62.9%) 59 (93.7%)
Eye_Cataract
No 86 (92.5%) 0 (0%) 0 (0%) 6 (7.2%) 4 (12.9%) 92 (34.5%) 4 (6.3%)
Yes 7 (7.5%) 0 (0%) 0 (0%) 1 (1.2%) 0 (0%) 8 (3.0%) 0 (0%)
Missing 0 (0%) 91 (100%) 32 (100%) 76 (91.6%) 27 (87.1%) 167 (62.5%) 59 (93.7%)
Eye_Glaucoma
No 91 (97.8%) 0 (0%) 0 (0%) 7 (8.4%) 4 (12.9%) 98 (36.7%) 4 (6.3%)
Unknown 1 (1.1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (0.4%) 0 (0%)
Yes 1 (1.1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 1 (0.4%) 0 (0%)
Missing 0 (0%) 91 (100%) 32 (100%) 76 (91.6%) 27 (87.1%) 167 (62.5%) 59 (93.7%)
Eye_Blind
No 87 (93.5%) 0 (0%) 0 (0%) 7 (8.4%) 4 (12.9%) 94 (35.2%) 4 (6.3%)
Yes 6 (6.5%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 6 (2.2%) 0 (0%)
Missing 0 (0%) 91 (100%) 32 (100%) 76 (91.6%) 27 (87.1%) 167 (62.5%) 59 (93.7%)
Eye_Sight
No 61 (65.6%) 0 (0%) 0 (0%) 4 (4.8%) 3 (9.7%) 65 (24.3%) 3 (4.8%)
Yes 32 (34.4%) 0 (0%) 0 (0%) 3 (3.6%) 1 (3.2%) 35 (13.1%) 1 (1.6%)
Missing 0 (0%) 91 (100%) 32 (100%) 76 (91.6%) 27 (87.1%) 167 (62.5%) 59 (93.7%)
Smoker
No 72 (77.4%) 77 (84.6%) 28 (87.5%) 73 (88.0%) 27 (87.1%) 222 (83.1%) 55 (87.3%)
Yes 21 (22.6%) 14 (15.4%) 4 (12.5%) 10 (12.0%) 4 (12.9%) 45 (16.9%) 8 (12.7%)

Statistical analysis and volcano plots

For all comparisons of interest, I first identified the largest absolute log2FC and the smallest p-value for setting the axis limits across all plots.

Code
# Preparing output vectors.
max_l2FC <- 1
min_p <- 1

max_l2FC <- max(max_l2FC, 
                nightingale_qc_final |> 
                  filter(Timepoint == "Baseline",
                         Phenotype %in% c("DM", "TB"),
                         !is.na(Concentration)) |> 
                  group_by(Biomarker, Phenotype) |> 
                  summarize(Mean_Conc = mean(Concentration)) |> 
                  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
                  mutate(log2FC = log2(`TB` / `DM`)) |> 
                  arrange(desc(abs(log2FC))) |> 
                  ungroup() |> 
                  slice_head(n = 1) |> 
                  pull(log2FC) |> 
                  abs(), 
                nightingale_qc_final |> 
                  filter(Timepoint == "Baseline",
                         Phenotype %in% c("TB-DM", "TB"),
                         !is.na(Concentration)) |> 
                  group_by(Biomarker, Phenotype) |> 
                  summarize(Mean_Conc = mean(Concentration)) |> 
                  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
                  mutate(log2FC = log2(`TB` / `TB-DM`)) |> 
                  arrange(desc(abs(log2FC))) |> 
                  ungroup() |> 
                  slice_head(n = 1) |> 
                  pull(log2FC) |> 
                  abs(), 
                nightingale_qc_final |> 
                  filter(Timepoint == "Baseline",
                         Phenotype %in% c("TB-DM", "DM"),
                         !is.na(Concentration)) |> 
                  group_by(Biomarker, Phenotype) |> 
                  summarize(Mean_Conc = mean(Concentration)) |> 
                  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
                  mutate(log2FC = log2(`TB-DM` / `DM`)) |> 
                  arrange(desc(abs(log2FC))) |> 
                  ungroup() |> 
                  slice_head(n = 1) |> 
                  pull(log2FC) |> 
                  abs(), 
                nightingale_qc_final |> 
                  filter(Timepoint == "Month_2",
                         Phenotype %in% c("TB-DM", "TB"),
                         !is.na(Concentration)) |> 
                  group_by(Biomarker, Phenotype) |> 
                  summarize(Mean_Conc = mean(Concentration)) |> 
                  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
                  mutate(log2FC = log2(`TB` / `TB-DM`)) |> 
                  arrange(desc(abs(log2FC))) |> 
                  ungroup() |> 
                  slice_head(n = 1) |> 
                  pull(log2FC) |> 
                  abs(),
                nightingale_qc_final |> 
                  filter(Phenotype== "TB",
                         !is.na(Concentration)) |> 
                  left_join(nightingale_qc_final |> 
                  filter(Phenotype== "TB",
                         !is.na(Concentration)) |> 
                  select(PatientID, Biomarker, Timepoint) |> 
                  unique() |> 
                  group_by(PatientID, Biomarker) |> 
                  summarize(N = n()) |> 
                  filter(N == 2), by = c("PatientID", "Biomarker")) |> 
                  filter(N == 2) |> 
                  select(-N) |> 
                  group_by(Biomarker, Timepoint) |> 
                  summarize(Mean_Conc = mean(Concentration)) |> 
                  pivot_wider(names_from = "Timepoint", values_from = "Mean_Conc") |> 
                  mutate(log2FC = log2(`Month_2` / `Baseline`)) |> 
                  arrange(desc(abs(log2FC))) |> 
                  ungroup() |> 
                  slice_head(n = 1) |> 
                  pull(log2FC) |> 
                  abs(),
                nightingale_qc_final |> 
                  filter(Phenotype== "TB-DM",
                         !is.na(Concentration)) |> 
                  left_join(nightingale_qc_final |> 
                  filter(Phenotype== "TB-DM",
                         !is.na(Concentration)) |> 
                  select(PatientID, Biomarker, Timepoint) |> 
                  unique() |> 
                  group_by(PatientID, Biomarker) |> 
                  summarize(N = n()) |> 
                  filter(N == 2), by = c("PatientID", "Biomarker")) |> 
                  filter(N == 2) |> 
                  select(-N) |> 
                  group_by(Biomarker, Timepoint) |> 
                  summarize(Mean_Conc = mean(Concentration)) |> 
                  pivot_wider(names_from = "Timepoint", values_from = "Mean_Conc") |> 
                  mutate(log2FC = log2(`Month_2` / `Baseline`)) |> 
                  arrange(desc(abs(log2FC))) |> 
                  ungroup() |> 
                  slice_head(n = 1) |> 
                  pull(log2FC) |> 
                  abs())

min_p <- min(min_p, 
             nightingale_qc_final |>
               filter(Timepoint == "Baseline",
                      Phenotype %in% c("DM", "TB"),
                      !is.na(Concentration)) |> 
               group_by(Biomarker) |> 
               summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                                      broom::tidy())) |> 
               unnest(c(out)) |> 
               arrange(p.value) |> 
               ungroup() |> 
               slice_head(n = 1) |> 
               pull(p.value), 
             nightingale_qc_final |>
               filter(Timepoint == "Baseline",
                      Phenotype %in% c("TB-DM", "TB"),
                      !is.na(Concentration)) |> 
               group_by(Biomarker) |> 
               summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                                      broom::tidy())) |> 
               unnest(c(out)) |> 
               arrange(p.value) |> 
               ungroup() |> 
               slice_head(n = 1) |> 
               pull(p.value), 
             nightingale_qc_final |>
               filter(Timepoint == "Baseline",
                      Phenotype %in% c("DM", "TB-DM"),
                      !is.na(Concentration)) |> 
               group_by(Biomarker) |> 
               summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                                      broom::tidy())) |> 
               unnest(c(out)) |> 
               arrange(p.value) |> 
               ungroup() |> 
               slice_head(n = 1) |> 
               pull(p.value), 
             nightingale_qc_final |>
               filter(Timepoint == "Month_2",
                      Phenotype %in% c("TB-DM", "TB"),
                      !is.na(Concentration)) |> 
               group_by(Biomarker) |> 
               summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                                      broom::tidy())) |> 
               unnest(c(out)) |> 
               arrange(p.value) |> 
               ungroup() |> 
               slice_head(n = 1) |> 
               pull(p.value),
             nightingale_qc_final |> 
               filter(Phenotype== "TB",
                      !is.na(Concentration)) |> 
               left_join(nightingale_qc_final |> 
               filter(Phenotype== "TB",
                      !is.na(Concentration)) |> 
               select(PatientID, Biomarker, Timepoint) |> 
               unique() |> 
               group_by(PatientID, Biomarker) |> 
               summarize(N = n()) |> 
               filter(N == 2), by = c("PatientID", "Biomarker")) |> 
               filter(N == 2) |> 
               select(-N) |> 
               group_by(Biomarker) |> 
               summarize(out = list(pairwise.wilcox.test(Concentration, Timepoint, paired = TRUE, p.adjust.method = "none") |> 
                                      broom::tidy())) |> 
               unnest(c(out)) |> 
               arrange(p.value) |> 
               ungroup() |> 
               slice_head(n = 1) |> 
               pull(p.value),
             nightingale_qc_final |> 
               filter(Phenotype== "TB-DM",
                      !is.na(Concentration)) |> 
               left_join(nightingale_qc_final |> 
               filter(Phenotype== "TB-DM",
                      !is.na(Concentration)) |> 
               select(PatientID, Biomarker, Timepoint) |> 
               unique() |> 
               group_by(PatientID, Biomarker) |> 
               summarize(N = n()) |> 
               filter(N == 2), by = c("PatientID", "Biomarker")) |> 
               filter(N == 2) |> 
               select(-N) |> 
               group_by(Biomarker) |> 
               summarize(out = list(pairwise.wilcox.test(Concentration, Timepoint, paired = TRUE, p.adjust.method = "none") |> 
                                      broom::tidy())) |> 
               unnest(c(out)) |> 
               arrange(p.value) |> 
               ungroup() |> 
               slice_head(n = 1) |> 
               pull(p.value))

Baseline: TB vs DM

Code
# Labeling comparison.
comparison <- "Baseline: TB vs DM"

# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Timepoint == "Baseline",
         Phenotype %in% c("DM", "TB"),
         !is.na(Concentration))

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Phenotype) |> 
  summarize(Mean_Conc = mean(Concentration)) |> 
  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`TB` / `DM`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))

# Plotting volcano plot for all biomarkers.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(!Biomarker %in% c(lipoprotein_subclasses, ratios)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all lipid biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(lipoproteins, fatty_acids, other_lipids)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_TB_vs_DM.png"), height = 7, width = 7, units = "in")

# Plotting volcano plot for Julias selection.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(Julias_selection)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("red2", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_TB_vs_DM_Julias_selection.svg"), height = 4, width = 4, units = "in")

NG_TB_vs_DM <- tmp 

Baseline: TB-DM vs DM

Code
# Labeling comparison.
comparison <- "Baseline: TB-DM vs DM"

# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Timepoint == "Baseline",
         Phenotype %in% c("DM", "TB-DM"),
         !is.na(Concentration))

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Phenotype) |> 
  summarize(Mean_Conc = mean(Concentration)) |> 
  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`TB-DM` / `DM`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))



# Plotting volcano plot for all biomarkers.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(!Biomarker %in% c(lipoprotein_subclasses, ratios)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all lipid biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(lipoproteins, fatty_acids, other_lipids)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all lipid biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(lipoproteins, fatty_acids, other_lipids)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3, max.overlaps = 20) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
### Plotting volcano plot for Julias selection.

# Labeling comparison.
comparison <- "Baseline: TB-DM vs DM"

# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Timepoint == "Baseline",
         Phenotype %in% c("DM", "TB-DM"),
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection)) # added 

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Phenotype) |> 
  summarize(Mean_Conc = mean(Concentration)) |> 
  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`TB-DM` / `DM`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))


tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(Julias_selection)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)




# plot with making labels infinite and thinner
tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = subset(tmp, signif == "Significant"), # Only label significant dots
                           aes(label = Biomarker), 
                           size = 2.5,           # Reduce label size
                           max.overlaps = Inf,   # Allow all overlaps for significant dots
                           nudge_x = 0.1,        # Adjust x position for better spacing
                           nudge_y = 0.1,        # Adjust y position for better spacing
                           segment.size = 0.2) + # Make connecting lines thinner
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("red2", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_TBDM_vs_DM_Julias_selection.svg"), height = 4, width = 4, units = "in")

NG_TBDM_vs_DM <- tmp



### Labeling ApoB in Julias selection. 
tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  
  # Plot all points, except ApoB
  geom_point(data = subset(tmp, Biomarker != "ApoB"), aes(color = signif), show.legend = F) +
  
  # Plot ApoB dot in blue
  geom_point(data = subset(tmp, Biomarker == "ApoB"), color = "blue", size = 2) +
  
  # Label only ApoB
  ggrepel::geom_text_repel(data = subset(tmp, Biomarker == "ApoB"),
                           aes(label = Biomarker), 
                           size = 3, 
                           nudge_x = 0.1, 
                           nudge_y = 0.1, 
                           segment.size = 0.2) +
  
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("red2", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Baseline: TB-DM vs TB

Code
# Labeling comparison.
comparison <- "Baseline: TB-DM vs TB"

# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Timepoint == "Baseline",
         Phenotype %in% c("TB", "TB-DM"),
         !is.na(Concentration))

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Phenotype) |> 
  summarize(Mean_Conc = mean(Concentration)) |> 
  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`TB-DM` / `TB`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))

# Plotting volcano plot for all biomarkers.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(!Biomarker %in% c(lipoprotein_subclasses, ratios)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all lipid biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(lipoproteins, fatty_acids, other_lipids)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for Julias selection.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(Julias_selection)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("red2", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_TBDM_vs_TB_Julias_selection.svg"), height = 4, width = 4, units = "in")

NG_TBDM_vs_TB <- tmp

Month 2: TB-DM vs TB

Code
# Labeling comparison.
comparison <- "Month 2: TB-DM vs TB"

# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Timepoint == "Month_2",
         Phenotype %in% c("TB", "TB-DM"),
         !is.na(Concentration))

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Phenotype) |> 
  summarize(Mean_Conc = mean(Concentration)) |> 
  pivot_wider(names_from = "Phenotype", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`TB-DM` / `TB`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Phenotype, paired = FALSE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))

# Plotting volcano plot for all biomarkers.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(!Biomarker %in% c(lipoprotein_subclasses, ratios)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all lipid biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(lipoproteins, fatty_acids, other_lipids)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for Julia's selection.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(Julias_selection)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("red2", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_TBDM_vs_TB_month2_Julias_selection.svg"), height = 4, width = 4, units = "in")

TB: Month 2 vs Baseline

Code
# Labeling comparison.
comparison <- "TB: Month 2 vs Baseline"

# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Phenotype== "TB",
         !is.na(Concentration))

# Filtering to exclude measurements of biomarkers that didn't have both timepoints for a patient.
both_timepoint <- filtered_data |> 
  select(PatientID, Biomarker, Timepoint) |> 
  unique() |> 
  group_by(PatientID, Biomarker) |> 
  summarize(N = n()) |> 
  filter(N == 2)

filtered_data <- filtered_data |> 
  left_join(both_timepoint, by = c("PatientID", "Biomarker")) |> 
  filter(N == 2) |> 
  select(-N)

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Timepoint) |> 
  summarize(Mean_Conc = mean(Concentration)) |> 
  pivot_wider(names_from = "Timepoint", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`Month_2` / `Baseline`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Timepoint, paired = TRUE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))

# Plotting volcano plot for all biomarkers.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(!Biomarker %in% c(lipoprotein_subclasses, ratios)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all lipid biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(lipoproteins, fatty_acids, other_lipids)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("red2", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for Julia's selection.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(Julias_selection)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("red2", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_TB_Baseline_vs_month2_Julias_selection.svg"), height = 4, width = 4, units = "in")

TB-DM: Month 2 vs Baseline

Code
# Labeling comparison.
comparison <- "TB-DM: Month 2 vs Baseline"

# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Phenotype== "TB-DM",
         !is.na(Concentration))

# Filtering to exclude measurements of biomarkers that didn't have both timepoints for a patient.
both_timepoint <- filtered_data |> 
  select(PatientID, Biomarker, Timepoint) |> 
  unique() |> 
  group_by(PatientID, Biomarker) |> 
  summarize(N = n()) |> 
  filter(N == 2)

filtered_data <- filtered_data |> 
  left_join(both_timepoint, by = c("PatientID", "Biomarker")) |> 
  filter(N == 2) |> 
  select(-N)

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Timepoint) |> 
  summarize(Mean_Conc = mean(Concentration)) |> 
  pivot_wider(names_from = "Timepoint", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`Month_2` / `Baseline`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Timepoint, paired = TRUE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))

# Plotting volcano plot for all biomarkers.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(!Biomarker %in% c(lipoprotein_subclasses, ratios)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for all lipid biomarkers excluding subclasses.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(lipoproteins, fatty_acids, other_lipids)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3, max.overlaps = Inf) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("darkred", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# Plotting volcano plot for Julias selection.
tmp <- log2FC_df |> 
  left_join(p_values, by = "Biomarker") |> 
  select(-(group1:group2)) |> 
  filter(Biomarker %in% c(Julias_selection)) |> 
  ungroup() |>
  mutate(p.adj = p.adjust(p.value, method = "fdr"),
         `-log10P` = -log10(p.adj),
         signif = case_when(p.adj < 0.05 ~ "Significant",
                            TRUE ~ "Non-significant"))

labels <- tmp |> 
  filter(p.adj < 0.05)

tmp |> 
  ggplot(aes(x = log2FC, y = `-log10P`)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  geom_point(aes(color = signif), show.legend = F) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3) +
  scale_y_continuous(limits = c(0, -log10(min_p)), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-max_l2FC, max_l2FC)) +
  scale_color_discrete(limits = c("Significant", "Non-significant"), type = c("red2", "gray")) +
  labs(title = comparison,
       y = "-log10(adjusted p-value)",
       x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_TBDM_Baseline_vs_month2_Julias_selection.svg"), height = 4, width = 4, units = "in")

Month 2 vs baseline TB-DM vs TB

Code
# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Phenotype== "TB",
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection))

# Filtering to exclude measurements of biomarkers that didn't have both timepoints for a patient.
both_timepoint <- filtered_data |> 
  select(PatientID, Biomarker, Timepoint) |> 
  unique() |> 
  group_by(PatientID, Biomarker) |> 
  summarize(N = n()) |> 
  filter(N == 2)

filtered_data <- filtered_data |> 
  left_join(both_timepoint, by = c("PatientID", "Biomarker")) |> 
  filter(N == 2) |> 
  select(-N)

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Timepoint) |> 
  summarize(Mean_Conc = median(Concentration)) |> 
  pivot_wider(names_from = "Timepoint", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`Month_2` / `Baseline`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Timepoint, paired = TRUE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))

TB_res <- log2FC_df |> 
  left_join(p_values |> select(Biomarker, p.value), by = "Biomarker") |> 
  mutate(Group = "TB")




# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Phenotype== "TB-DM",
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection))

# Filtering to exclude measurements of biomarkers that didn't have both timepoints for a patient.
both_timepoint <- filtered_data |> 
  select(PatientID, Biomarker, Timepoint) |> 
  unique() |> 
  group_by(PatientID, Biomarker) |> 
  summarize(N = n()) |> 
  filter(N == 2)

filtered_data <- filtered_data |> 
  left_join(both_timepoint, by = c("PatientID", "Biomarker")) |> 
  filter(N == 2) |> 
  select(-N)

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Timepoint) |> 
  summarize(Mean_Conc = mean(Concentration)) |> 
  pivot_wider(names_from = "Timepoint", values_from = "Mean_Conc") |> 
  mutate(log2FC = log2(`Month_2` / `Baseline`))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Timepoint, paired = TRUE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))

TB_DM_res <- log2FC_df |> 
  left_join(p_values |> select(Biomarker, p.value), by = "Biomarker") |> 
  mutate(Group = "TB-DM")

combined_res <- rbind(TB_res, TB_DM_res)

tmp <- combined_res |> 
  select(Biomarker, log2FC, p.value, Group) |> 
  pivot_wider(names_from = Group, values_from = c(log2FC, p.value)) |> 
  ungroup() |> # you need to ungroup to correct for all biomarkers and not within one biomarker. 
  mutate(p.adj_TB = p.adjust(p.value_TB, method = "fdr"),
         `p.adj_TB-DM` = p.adjust(`p.value_TB-DM`, method = "fdr"),
         Significance = case_when(p.adj_TB < 0.05 & `p.adj_TB-DM` < 0.05 ~ "Significant in both",
                                  p.adj_TB < 0.05 & `p.adj_TB-DM` > 0.05 ~ "Significant in TB",
                                  p.adj_TB > 0.05 & `p.adj_TB-DM` < 0.05 ~ "Significant in TB-DM",
                                  TRUE ~ "Not significant"))

labels <- tmp |> 
  filter(Significance != "Not significant")

tmp |> 
  ggplot(aes(x = log2FC_TB, y = `log2FC_TB-DM`, color = Significance)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(slope = 1, color = "gray") +
  geom_point(size = 2, alpha = 0.7) +
  ggrepel::geom_text_repel(data = labels, aes(label = Biomarker), size = 3, max.overlaps = 20, show.legend = F) +
  scale_x_continuous(limits = c(-max(abs(combined_res$log2FC)), max(abs(combined_res$log2FC)))) +
  scale_y_continuous(limits = c(-max(abs(combined_res$log2FC)), max(abs(combined_res$log2FC)))) +
  scale_x_continuous(limits = c(-0.75, 0.75)) +  
  scale_y_continuous(limits = c(-0.75, 0.75)) +
  scale_color_discrete(type = c("gray", "green4", "blue3", "deeppink3")) +
  labs(x = "TB\nlog2FC",
       y = "TB-DM\nlog2FC",
       title = "Month 2 vs Baseline") +
  theme_bw() +
  theme(legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

Code
ggsave(filename = path(path1, "output/Nightingale_comined_Volcano_TB_vs_TBDM_timepoints.svg"), height = 5, width = 6, units = "in")

Volcano co-variates

For the supplementary figures, the volcano plot results are corrected for potential co-variates age and sex.

Code
# T0 - TB versus DM
filtered_data <- nightingale_qc_final |>
  filter(Timepoint == "Baseline",
         Phenotype %in% c("TB", "DM"),
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection)) |>
select(SampleID, Phenotype, Biomarker, Concentration, Age, Sex) |>
  pivot_wider(names_from = Biomarker, values_from = Concentration)

dat_de_pr <- filtered_data$Phenotype
Phenotype <- dat_de_pr
Age <- filtered_data$Age
Sex <- filtered_data$Sex

design_cov <- model.matrix(~ Phenotype + Age + Sex)
colnames(design_cov) <- c("Intercept","Phenotype", "Age", "Sex")

dat_de <- filtered_data |>
  select(!(1:4))

dat_de_tr <- as.matrix(t(dat_de))
fit1_cov <- lmFit(dat_de_tr, design_cov)
cont.matrix_cov <- makeContrasts("Phenotype", levels = design_cov)
fit2_cov <- contrasts.fit(fit1_cov,cont.matrix_cov)
fit2_cov <- eBayes(fit2_cov)
DEP_cov_3 <- topTable(fit2_cov, coef = "Phenotype", adjust = "BH", number = ncol(dat_de))
#results_cov <- decideTests(fit2_cov)
#summary(results_cov)

# make a new column called "protein" from rownames 
DEP_cov_3 <- DEP_cov_3 %>% mutate(protein = rownames(DEP_cov_3))

# Volcano controlling for covariates sex and age
max.overlaps=Inf
ggplot(DEP_cov_3, aes(x = logFC, y = -log10(adj.P.Val),label = protein)) +
  geom_text_repel(data = subset(DEP_cov_3,adj.P.Val < 0.05 ), size = 3) +
  geom_point(color = ifelse(DEP_cov_3$adj.P.Val > 0.05, "black", "red"), size = 1.5) +
  scale_y_continuous(limits = c(0, 25), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-7, 3.6)) +
  theme_bw() +
  xlab("log2FC") +
  ylab("-log10(adjusted p-value)") +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  labs(title = "Baseline: TB vs DM",
       y = "-log10(adjusted p-value)", x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_Baseline_DM_TB_corrected.svg"), height = 4, width = 4, units = "in")




# T0 - TB-DM versus DM
filtered_data <- nightingale_qc_final |>
  filter(Timepoint == "Baseline",
         Phenotype %in% c("TB-DM", "DM"),
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection)) |>
select(SampleID, Phenotype, Biomarker, Concentration, Age, Sex) |>
  pivot_wider(names_from = Biomarker, values_from = Concentration)

dat_de_pr <- filtered_data$Phenotype
Phenotype <- dat_de_pr
Age <- filtered_data$Age
Sex <- filtered_data$Sex

design_cov <- model.matrix(~ Phenotype + Age + Sex)
colnames(design_cov) <- c("Intercept","Phenotype", "Age", "Sex")

dat_de <- filtered_data |>
  select(!(1:4))

dat_de_tr <- as.matrix(t(dat_de))
fit1_cov <- lmFit(dat_de_tr, design_cov)
cont.matrix_cov <- makeContrasts("Phenotype", levels = design_cov)
fit2_cov <- contrasts.fit(fit1_cov,cont.matrix_cov)
fit2_cov <- eBayes(fit2_cov)
DEP_cov_3 <- topTable(fit2_cov, coef = "Phenotype", adjust = "BH", number = ncol(dat_de))
#results_cov <- decideTests(fit2_cov)
#summary(results_cov)

# make a new column called "protein" from rownames 
DEP_cov_3 <- DEP_cov_3 %>% mutate(protein = rownames(DEP_cov_3))

# Volcano controlling for covariates sex and age
max.overlaps=Inf
ggplot(DEP_cov_3, aes(x = logFC, y = -log10(adj.P.Val),label = protein)) +
  geom_text_repel(data = subset(DEP_cov_3,adj.P.Val < 0.05 ), size = 3) +
  geom_point(color = ifelse(DEP_cov_3$adj.P.Val > 0.05, "black", "red"), size = 1.5) +
  scale_y_continuous(limits = c(0, 25), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-3.6, 3.6)) +
  theme_bw() +
  xlab("log2FC") +
  ylab("-log10(adjusted p-value)") +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  labs(title = "Baseline: TB-DM vs DM",
       y = "-log10(adjusted p-value)", x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
# labeling all significant dots
ggplot(DEP_cov_3, aes(x = logFC, y = -log10(adj.P.Val), label = protein)) +
  geom_point(color = ifelse(DEP_cov_3$adj.P.Val > 0.05, "black", "red"), size = 1.5) +
  ggrepel::geom_text_repel(data = subset(DEP_cov_3, adj.P.Val < 0.05),  # Only label significant points
                           size = 3,                                    # Same label size
                           segment.size = 0.2,                          # Thinner connecting lines
                           max.overlaps = Inf) +                        # Allow all overlaps for significant points
  scale_y_continuous(limits = c(0, 25), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-3.6, 3.6)) +
  theme_bw() +
  xlab("log2FC") +
  ylab("-log10(adjusted p-value)") +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  labs(title = "Baseline: TB-DM vs DM",
       y = "-log10(adjusted p-value)", x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_Baseline_DM_TBDM_corrected.svg"), height = 4, width = 4, units = "in")




# T0 - TB-DM versus TB
filtered_data <- nightingale_qc_final |>
  filter(Timepoint == "Baseline",
         Phenotype %in% c("TB-DM", "TB"),
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection)) |>
select(SampleID, Phenotype, Biomarker, Concentration, Age, Sex) |>
  pivot_wider(names_from = Biomarker, values_from = Concentration)

dat_de_pr <- filtered_data$Phenotype
Phenotype <- dat_de_pr
Age <- filtered_data$Age
Sex <- filtered_data$Sex

design_cov <- model.matrix(~ Phenotype + Age + Sex)
colnames(design_cov) <- c("Intercept","Phenotype", "Age", "Sex")

dat_de <- filtered_data |>
  select(!(1:4))

dat_de_tr <- as.matrix(t(dat_de))
fit1_cov <- lmFit(dat_de_tr, design_cov)
cont.matrix_cov <- makeContrasts("Phenotype", levels = design_cov)
fit2_cov <- contrasts.fit(fit1_cov,cont.matrix_cov)
fit2_cov <- eBayes(fit2_cov)
DEP_cov_3 <- topTable(fit2_cov, coef = "Phenotype", adjust = "BH", number = ncol(dat_de))
#results_cov <- decideTests(fit2_cov)
#summary(results_cov)

# make a new column called "protein" from rownames 
DEP_cov_3 <- DEP_cov_3 %>% mutate(protein = rownames(DEP_cov_3))

# Volcano controlling for covariates sex and age
max.overlaps=Inf
ggplot(DEP_cov_3, aes(x = logFC, y = -log10(adj.P.Val),label = protein)) +
  geom_text_repel(data = subset(DEP_cov_3,adj.P.Val < 0.05 ), size = 3) +
  geom_point(color = ifelse(DEP_cov_3$adj.P.Val > 0.05, "black", "red"), size = 1.5) +
  scale_y_continuous(limits = c(0, 25), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-3.6, 9.4)) +
  theme_bw() +
  xlab("log2FC") +
  ylab("-log10(adjusted p-value)") +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  labs(title = "Baseline: TB-DM vs TB",
       y = "-log10(adjusted p-value)", x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_Baseline_TBDM_TB_corrected.svg"), height = 4, width = 4, units = "in")




#  Month 2 - TB-DM versus TB
filtered_data <- nightingale_qc_final |>
  filter(Timepoint == "Month_2",
         Phenotype %in% c("TB-DM", "TB"),
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection)) |>
select(SampleID, Phenotype, Biomarker, Concentration, Age, Sex) |>
  pivot_wider(names_from = Biomarker, values_from = Concentration)

dat_de_pr <- filtered_data$Phenotype
Phenotype <- dat_de_pr
Age <- filtered_data$Age
Sex <- filtered_data$Sex

design_cov <- model.matrix(~ Phenotype + Age + Sex)
colnames(design_cov) <- c("Intercept","Phenotype", "Age", "Sex")

dat_de <- filtered_data |>
  select(!(1:4))

dat_de_tr <- as.matrix(t(dat_de))
fit1_cov <- lmFit(dat_de_tr, design_cov)
cont.matrix_cov <- makeContrasts("Phenotype", levels = design_cov)
fit2_cov <- contrasts.fit(fit1_cov,cont.matrix_cov)
fit2_cov <- eBayes(fit2_cov)
DEP_cov_3 <- topTable(fit2_cov, coef = "Phenotype", adjust = "BH", number = ncol(dat_de))
#results_cov <- decideTests(fit2_cov)
#summary(results_cov)

# make a new column called "protein" from rownames 
DEP_cov_3 <- DEP_cov_3 %>% mutate(protein = rownames(DEP_cov_3))

# Volcano controlling for covariates sex and age
max.overlaps=Inf
ggplot(DEP_cov_3, aes(x = logFC, y = -log10(adj.P.Val),label = protein)) +
  geom_text_repel(data = subset(DEP_cov_3,adj.P.Val < 0.05 ), size = 3) +
  geom_point(color = ifelse(DEP_cov_3$adj.P.Val > 0.05, "black", "red"), size = 1.5) +
  scale_y_continuous(limits = c(0, 25), expand = expansion(mult = c(0, 0.05))) +
  scale_x_continuous(limits = c(-3.6, 10)) +
  theme_bw() +
  xlab("log2FC") +
  ylab("-log10(adjusted p-value)") +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = -log10(0.05), alpha = 0.5, color = "darkred", linetype = 2) +
  labs(title = "Month 2: TB-DM vs TB",
       y = "-log10(adjusted p-value)", x = "log2FC") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Volcano_Month2_TBDM_TB_corrected.svg"), height = 4, width = 4, units = "in")

Lineplot

Code
# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Phenotype== "TB",
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection))

# Filtering to exclude measurements of biomarkers that didn't have both timepoints for a patient.
both_timepoint <- filtered_data |> 
  select(PatientID, Biomarker, Timepoint) |> 
  unique() |> 
  group_by(PatientID, Biomarker) |> 
  summarize(N = n()) |> 
  filter(N == 2)

filtered_data <- filtered_data |> 
  left_join(both_timepoint, by = c("PatientID", "Biomarker")) |> 
  filter(N == 2) |> 
  select(-N)

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Timepoint) |> 
  summarize(Median_Concentration = median(Concentration)) |> 
  pivot_wider(names_from = "Timepoint", values_from = "Median_Concentration") |> 
  mutate(log2FC = log2(Month_2 / Baseline))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Timepoint, paired = TRUE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))

TB_res <- log2FC_df |> 
  left_join(p_values |> select(Biomarker, p.value), by = "Biomarker") |> 
  mutate(Group = "TB")

# Preparing data.
filtered_data <- nightingale_qc_final |> 
  filter(Phenotype== "TB-DM",
         !is.na(Concentration),
         Biomarker %in% c(Julias_selection))

# Filtering to exclude measurements of biomarkers that didn't have both timepoints for a patient.
both_timepoint <- filtered_data |> 
  select(PatientID, Biomarker, Timepoint) |> 
  unique() |> 
  group_by(PatientID, Biomarker) |> 
  summarize(N = n()) |> 
  filter(N == 2)

filtered_data <- filtered_data |> 
  left_join(both_timepoint, by = c("PatientID", "Biomarker")) |> 
  filter(N == 2) |> 
  select(-N)

# Calculating log2FC for each metabolite.
log2FC_df <- filtered_data |> 
  group_by(Biomarker, Timepoint) |> 
  summarize(Median_Concentration = median(Concentration)) |> 
  pivot_wider(names_from = "Timepoint", values_from = "Median_Concentration") |> 
  mutate(log2FC = log2(Month_2 / Baseline))

# Running Wilcoxon signed rank test on each metabolite between timepoints.
p_values <- filtered_data |> 
  group_by(Biomarker) |> 
  summarize(out = list(pairwise.wilcox.test(Concentration, Timepoint, paired = TRUE, p.adjust.method = "none") |> 
                         broom::tidy())) |> 
  unnest(c(out))



TB_DM_res <- log2FC_df |> 
  left_join(p_values |> select(Biomarker, p.value), by = "Biomarker") |> 
  mutate(Group = "TB-DM")

combined_res <- rbind(TB_res, TB_DM_res)
  

tmp <- nightingale_qc_final |> 
  select(PatientID, Timepoint) |> 
  unique() |> 
  group_by(PatientID) |> 
  summarize(N = n()) |> 
  filter(N == 2) |>
  pull(PatientID)

tmp2 <- nightingale_qc_final %>%
  filter(PatientID %in% tmp) |> 
  group_by(Biomarker, Phenotype) |> 
  summarize(y.position = max(Concentration))

stat_join <- combined_res |> 
  ungroup() |> # you need to ungroup to correct for all biomarkers and not within one biomarker.
  select(Biomarker, p.value, Phenotype = Group) |> 
  pivot_wider(names_from = Phenotype, values_from = p.value) |> 
  mutate(TB = p.adjust(TB, method = "fdr"),
         `TB-DM` = p.adjust(`TB-DM`, method = "fdr")) |> 
  pivot_longer(cols = !Biomarker, names_to = "Phenotype", values_to = "p.adj") |> 
  mutate(p.signif = case_when(p.adj > 0.05 ~ "",
                              p.adj < 0.0001 ~ "****",
                              p.adj < 0.001 ~ "***",
                              p.adj < 0.01 ~ "**",
                              p.adj < 0.05 ~ "*"),
         group1 = "Baseline",
         group2 = "Month_2") |> 
  left_join(tmp2, by = c("Biomarker", "Phenotype"))

 
selected_biomarkers <- c("LDL-C", "HDL-C", "ApoB", "HDL-TG", "GlycA")

nightingale_qc_final %>%
  filter(Biomarker %in% selected_biomarkers,
         (PatientID %in% tmp | Phenotype == "DM")) |>
  mutate(Timepoint = factor(Timepoint, levels = c("Baseline", "Month_2"), labels = c("Baseline", "Month 2"))) %>%
  ggpubr::ggpaired(x = "Timepoint",
           y = "Concentration",
           id = "PatientID",
           color = "Phenotype",
           facet.by = c("Biomarker", "Phenotype"),
           scales = "free", #removes month 2 from diabetes
           width = 0.4,
           line.color = "lightgrey",
           line.alpha = 0.05,
           line.size = 0.4) +
  scale_color_manual(values = c("goldenrod", "blue4", "deeppink3")) +
  #facet_wrap(cytokines ~ Phenotype, scales = "free_x") +
  stat_summary(fun = median, geom = "line", aes(group = 1), linewidth = 1, color = "black") +
  #stat_summary(fun = median, geom = "text", aes(group = 1, label = after_stat(sprintf("%.1f", y))), hjust = 1.5, vjust = -1, color = "black") +

  # add pvalue from a data frame called "stat_join" that have a column called "cytokines" with the name of the protein as the value
  # and a column  called "asterisk" that show the asterisk. you can change it to a column that consists of the P.value.

  ggprism::add_pvalue(stat_join %>% filter(Biomarker %in% selected_biomarkers),
                      label = "p.signif",
                       x = 1.5,
                       remove.bracket = TRUE,
                       label.size = 7) +
  scale_y_continuous(expand = expansion(mult = c(0.1, 0.2)))+
  labs(y = "Concentration") +
  ggpubr::theme_classic2() +
  theme(axis.title.x = element_blank())

Code
ggsave(filename = path(path1, "output/Nightingale_Lineplot.svg"), height = 5, width = 6, units = "in")

Z-score based heatmap

Code
z_scores <- nightingale_qc_final |> 
  filter(Timepoint == "Baseline",
         !Biomarker %in% c(lipoprotein_subclasses, ratios)) |> 
  select(SampleID, Phenotype, Biomarker, Concentration) |> 
  pivot_wider(names_from = Biomarker, values_from = Concentration) |> 
  select(-SampleID) |> 
  group_by(Phenotype) |> 
  summarize(across(all_of(c(amino_acids, fatty_acids, glycolysis, ketones, lipoproteins, other, other_lipids)), function(x) median(x, na.rm = T))) |> 
  column_to_rownames("Phenotype") |> 
  as.matrix() |> 
  scale(center = T, scale = T) |> 
  t()

heatmap <- pheatmap::pheatmap(z_scores,
                              cluster_cols = FALSE,
                              fontsize_row = 8,
                              main = "")

Code
ggsave(plot = heatmap, filename = path(path1, "output/Baseline_z_score_Nightingale.svg"), height = 10, width = 3, units = "in")

# Z-score based heatmap for Julia's selection
z_scores <- nightingale_qc_final |> 
  filter(Timepoint == "Baseline",
         !Biomarker %in% c(lipoprotein_subclasses, ratios)) |> 
  select(SampleID, Phenotype, Biomarker, Concentration) |> 
  pivot_wider(names_from = Biomarker, values_from = Concentration) |> 
  select(-SampleID) |> 
  group_by(Phenotype) |> 
  summarize(across(all_of(c(Julias_selection)), function(x) median(x, na.rm = T))) |> 
  column_to_rownames("Phenotype") |> 
  as.matrix() |> 
  scale(center = T, scale = T) |> 
  t()

heatmap <- pheatmap::pheatmap(z_scores,
                              cluster_cols = FALSE,
                              fontsize_row = 8,
                              main = "")

Code
ggsave(plot = heatmap, filename = path(path1, "output/Baseline_z_score_Nightingale_Julias_selection.svg"), height = 7, width = 3, units = "in")

Table medians

Code
tmp <- nightingale_qc_final |> 
  filter(Timepoint == "Baseline",
         Biomarker %in% c(Julias_selection)) |> 
  select(SampleID, Phenotype, Timepoint, Biomarker, Concentration) |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(Median = median(Concentration, na.rm = T)) |> 
  pivot_wider(names_from = Biomarker, values_from = Median) |> 
  t() |> 
  as.data.frame() |> 
  rownames_to_column("Biomarker")
tmp
        Biomarker                   V1                   V2
1       Phenotype                   DM                   TB
2       Timepoint             Baseline             Baseline
3           ApoA1     1.22825892554971     1.00690590767307
4            ApoB    0.777565936234394    0.698726212625704
5  Clinical LDL-C     2.26466927230168     1.65961007835438
6         Glucose     9.67478630692256     4.38573821830714
7           GlycA    0.841061783895516    1.178900856243154
8           HDL-C    1.008800964231591    0.860643866128272
9          HDL-CE    0.782287727844539    0.666277128925908
10         HDL-FC    0.229022883152112    0.195944581806655
11          HDL-L     2.42545574906730     2.02217490257422
12          HDL-P  0.01262795367307050  0.00993059126122916
13         HDL-PL     1.24357421660063     1.05878301562313
14         HDL-TG    0.151333968383988    0.108055072448209
15          LDL-C     1.52421151011821     1.09019303825719
16         LDL-CE    1.110436251436176    0.785261109451974
17         LDL-FC    0.413761637856625    0.313007106100000
18          LDL-L     2.23186641228266     1.69420339640996
19          LDL-P 0.001076978964573927 0.000977596701751672
20         LDL-PL    0.550432221721622    0.429489806069240
21         LDL-TG    0.152511024713898    0.129995155899222
22      Remnant-C     1.39329825826509     1.19569872084661
23        Total-C     3.94555265914166     3.19811660168907
24       Total-CE     2.86879949951823     2.29066324532659
25       Total-FC     1.09625184295302     0.90760684735185
26        Total-L     7.56058107016467     6.19503848690937
27        Total-P   0.0143493105297104   0.0111907342020748
28       Total-PL     2.51379135328935     2.09464230212316
29       Total-TG    1.112295006139023    0.787248604493174
30         VLDL-C    0.641508535868598    0.601190942579161
31        VLDL-CE    0.389156706629375    0.374841406979388
32        VLDL-FC    0.254817287615571    0.222505594478605
33         VLDL-L     1.73593388966681     1.41446902791008
34         VLDL-P 0.000129853779965971 0.000118727863069566
35        VLDL-PL    0.418386187195962    0.357085531206319
36        VLDL-TG    0.670384841989725    0.467985846667001
37      non-HDL-C     2.86638522248820     2.35302848863771
                     V3
1                 TB-DM
2              Baseline
3      1.03878412697022
4     0.830552881038891
5      2.03272048480284
6     13.05988460426618
7     1.205029092170685
8     0.872073457312639
9     0.661498185524856
10    0.208335245650150
11     2.10520375690763
12  0.01025896968132983
13     1.08914494657201
14    0.148657259951922
15     1.38370624794573
16    1.004149683063895
17    0.364982882420742
18     2.06929162313314
19 0.001174448192310274
20    0.518543121604865
21    0.175360894844969
22     1.46210898498680
23     3.68391119332383
24     2.62117394391588
25     1.06517702673480
26     7.24818905373987
27   0.0120518907956562
28     2.37842512924707
29    1.221250317990078
30    0.761739584257257
31    0.463364757490651
32    0.300531317789125
33     2.05488535708013
34 0.000155643129798195
35    0.493701924683064
36    0.730297889582740
37     2.82242022847868
Code
write_csv(tmp, file = path(path1, "output/Nightingale_baseline_all_median.csv"), col_names = F)

# Filtering to exclude measurements of biomarkers that didn't have both timepoints for a patient.
both_timepoint <- nightingale_qc_final |> 
  select(PatientID, Biomarker, Timepoint) |> 
  unique() |> 
  group_by(PatientID, Biomarker) |> 
  summarize(N = n()) |> 
  filter(N == 2)

filtered_data <- nightingale_qc_final |> 
  left_join(both_timepoint, by = c("PatientID", "Biomarker")) |> 
  filter(N == 2) |> 
  select(-N)

tmp <- filtered_data |> 
  rbind(nightingale_qc_final |> filter(Phenotype == "DM")) |> 
  filter(Biomarker %in% c(Julias_selection)) |>
  select(SampleID, Phenotype, Timepoint, Biomarker, Concentration) |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(Median = median(Concentration, na.rm = T)) |> 
  pivot_wider(names_from = Biomarker, values_from = Median) |> 
  t() |> 
  as.data.frame() |> 
  rownames_to_column("Biomarker")
tmp
        Biomarker                   V1                   V2
1       Phenotype                   DM                   TB
2       Timepoint             Baseline             Baseline
3           ApoA1     1.22825892554971     1.00690590767307
4            ApoB    0.777565936234394    0.712468492297136
5  Clinical LDL-C     2.26466927230168     1.84583223456057
6         Glucose     9.67478630692256     4.44601808101682
7           GlycA    0.841061783895516    1.126524556988599
8           HDL-C    1.008800964231591    0.865185678617505
9          HDL-CE    0.782287727844539    0.666277128925908
10         HDL-FC    0.229022883152112    0.197895579292921
11          HDL-L     2.42545574906730     2.02148743996680
12          HDL-P   0.0126279536730705   0.0100608572488226
13         HDL-PL     1.24357421660063     1.03900421294024
14         HDL-TG    0.151333968383988    0.109973499233951
15          LDL-C     1.52421151011821     1.20166360004560
16         LDL-CE    1.110436251436176    0.862485714053993
17         LDL-FC    0.413761637856625    0.339177885991606
18          LDL-L     2.23186641228266     1.79536626282176
19          LDL-P  0.00107697896457393  0.00099192611911126
20         LDL-PL    0.550432221721622    0.460393540845614
21         LDL-TG    0.152511024713898    0.128331578732665
22      Remnant-C     1.39329825826509     1.26800692508298
23        Total-C     3.94555265914166     3.34731146188345
24       Total-CE     2.86879949951823     2.41218645408365
25       Total-FC    1.096251842953019    0.941145147852789
26        Total-L     7.56058107016467     6.20568381337301
27        Total-P   0.0143493105297104   0.0114645568888063
28       Total-PL     2.51379135328935     2.09464230212316
29       Total-TG    1.112295006139023    0.819695648480825
30         VLDL-C    0.641508535868598    0.613509404738198
31        VLDL-CE    0.389156706629375    0.382694386531959
32        VLDL-FC    0.254817287615571    0.226349535599773
33         VLDL-L     1.73593388966681     1.48317335282253
34         VLDL-P 0.000129853779965971 0.000121386744096677
35        VLDL-PL    0.418386187195962    0.364587104023911
36        VLDL-TG    0.670384841989725    0.467985846667001
37      non-HDL-C     2.86638522248820     2.48932961017285
                     V3                   V4                   V5
1                    TB                TB-DM                TB-DM
2               Month_2             Baseline              Month_2
3      1.09480462821923     1.11025804392067     1.21588787219860
4     0.736040922362810    0.832235621111175    0.923010893848697
5      2.01298763106602     2.15866412502834     2.68438853938550
6      4.85139348356051    13.22070203655920     9.02639898562571
7     0.790932836767744    1.192591987248245    0.851989557947065
8     0.969059252880914    0.924905604573749    0.980718975518094
9     0.739204140256214    0.681390489385612    0.763695912933656
10    0.214957169889902    0.221752547096731    0.221118674315735
11     2.21937985429730     2.30771050432966     2.31739585019526
12   0.0108573959487333   0.0107482664796886   0.0126068502858248
13     1.14348411831109     1.18622225463721     1.19718470086570
14    0.122799463593451    0.148928062719008    0.152205943214624
15     1.32453013355397     1.42648639386189     1.76812755997309
16    0.958984102880007    1.040771123518263    1.274832817035267
17    0.364339237334894    0.381935443544788    0.496420398661138
18     1.93109723439095     2.10559990182316     2.55245107443050
19  0.00103210140187649  0.00118538348954764  0.00128971732610804
20    0.490014418833902    0.529316010235051    0.628219965298633
21    0.123601352444293    0.155620146675135    0.167684414461182
22     1.32373778491971     1.48167826409354     1.61619716461310
23     3.60463254916668     3.70559445678950     4.56808925176494
24     2.57953680203162     2.65197030702908     3.29264681153027
25    1.021791713854857    1.063557367374476    1.289170810002882
26     6.76461953436458     7.34088154586122     8.32785993968164
27   0.0122571766408350   0.0122110873282506   0.0144221879627141
28     2.27825060089347     2.38206519187622     2.69334371218083
29    0.848594005509815    1.206239314479202    1.141178610548100
30    0.621030949158662    0.761935818928997    0.816117342966658
31    0.388083225627211    0.464300583100527    0.495336191843535
32    0.225919731176691    0.301015374897738    0.317771402866201
33     1.52720853457176     2.00092758592026     2.01351136217055
34 0.000119950829873412 0.000155080563777962 0.000158908397589347
35    0.371490377210629    0.489093221671945    0.497912040641468
36    0.506326882226039    0.750086555813354    0.691995007999054
37     2.61139245582618     2.87954430684275     3.42070252240637
Code
write_csv(tmp, file = path(path1, "output/Nightingale_paired_all_median.csv"), col_names = F)

Associations with metadata

Using linear regression to test the association of the scaled Concentration values with scaled Age, Sex, BMI, and HbA1c. The plot then just uses the estimate for each predictor on the outcome protein as the size of the square and the color is based on the FDR corrected p-value of the association.

Code
scaled_nightingale <- nightingale_qc_final |> 
  filter(!Biomarker %in% lipoprotein_subclasses) |> 
  select(SampleID, Phenotype, Timepoint, Biomarker, Concentration, Age, Sex, BMI, HbA1c, Haemoglobin, Timika_score, Month_6_Culture, Treatment_Failure) |> 
  mutate(Sex = case_when(Sex == "Male" ~ 1,
                         Sex == "Female" ~ 0),
         Month_6_Culture = case_when(Month_6_Culture == "Positive" ~ 1,
                                     Month_6_Culture == "Negative" ~ 0,
                                     TRUE ~ NA),
         Treatment_Failure = case_when(Treatment_Failure == "Yes" ~ 1,
                                       Treatment_Failure == "No" ~ 0,
                                       TRUE ~ NA)) |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  mutate(Concentration = scale(Concentration),
         BMI = scale(BMI),
         HbA1c = scale(HbA1c),
         Age = scale(Age),
         Sex = scale(Sex),
         Haemoglobin = scale(Haemoglobin),
         Timika_score = scale(Timika_score),
         Month_6_Culture = scale(Month_6_Culture),
         Treatment_Failure = scale(Treatment_Failure))

results_lm1 <- scaled_nightingale |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Age) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm2 <- scaled_nightingale |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ BMI) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm3 <- scaled_nightingale |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Sex) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm4 <- scaled_nightingale |> 
  filter(Phenotype != "DM") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Timika_score) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm5 <- scaled_nightingale |> 
  filter(Phenotype != "DM") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Haemoglobin) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm6 <- scaled_nightingale |> 
  filter(Phenotype != "TB") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ HbA1c) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm7 <- scaled_nightingale |> 
  filter(Phenotype != "DM",
         Timepoint == "Month_2") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Month_6_Culture) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm8 <- scaled_nightingale |> 
  filter(Phenotype != "DM",
         Timepoint == "Month_2") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Treatment_Failure) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm <- rbind(results_lm1, results_lm2, results_lm3, results_lm4, results_lm5, results_lm6, results_lm7, results_lm8) |> 
  filter(term != "(Intercept)") |> 
  select(-std.error, -statistic) |> 
  group_by(Phenotype, Timepoint, term) |> 
  mutate(abs_estimate = abs(estimate),
         direction = estimate / abs_estimate,
         p.adj = p.adjust(p.value, method = "fdr"),
         log10_p_value_and_direction = -log10(p.adj) * direction,
         FDR = cut(log10_p_value_and_direction, 
                   breaks =c(-35, -6, -4, -2.301, -2, -1.301, 1, 1.301, 2, 2.301, 4, 6, 35),
                   labels =c("<0.000001 (neg)", "<0.0001 (neg)", "<0.005 (neg)", "<0.01 (neg)", "<0.05 (neg)", "0", "0",
                             "<0.05 (pos)", "<0.01 (pos)", "<0.005 (pos)", "<0.0001 (pos)", "<0.000001 (pos)")))

# Creating bubble plot.
palette_heatmap <- c("<0.000001 (pos)"="#800026",
                     "<0.0001 (pos)"="#BD0026",
                     "<0.005 (pos)"="#E31A1C",
                     "<0.01 (pos)"="#FC4E2A",
                     "<0.05 (pos)"="#FD8D3C",
                     "non-signif (pos)"="khaki1",
                     #"non-signif"="white", 
                     "non-signif (neg)"="lightcyan",
                     "<0.05 (neg)"="#6BAED6",
                     "<0.01 (neg)"="#4292C6",
                     "<0.005 (neg)"="#2171B5",
                     "<0.0001 (neg)"="#08519C",
                     "<0.000001 (neg)"="#08306B")

results_lm |> 
  filter(Timepoint == "Baseline") |> 
  mutate(Biomarker = factor(Biomarker, levels = rev(unique(results_lm$Biomarker))),
         term = case_when(term == "Timika_score" ~ "Timika score",
                          term == "Sex" ~ "Sex (male)",
                          TRUE ~ term)) |> 
  ggplot(aes(x = term, 
             y = Biomarker, 
             size = abs_estimate, 
             color = FDR)) + 
  geom_point(shape = 15) + 
  scale_size_continuous(range = c(1, 7.5), breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)) +
  scale_color_manual(values = palette_heatmap, na.value = "grey90") +
  labs(size = "Effect size\n(absolute)") + 
  facet_grid(~ Phenotype, scale = "free") +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 0, size = 15), # changing metadata text size
        axis.text.y = element_text(size = 15),  # changing y axis text size (proteins)
        legend.title = element_text(size = 15),
        legend.text = element_text(size = 15), 
        legend.key.size = unit(0.3, "line"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(), 
        panel.background = element_rect(fill = "gainsboro"),
        panel.grid = element_blank(),      
        strip.text.y = element_text(size = 10), # changing facet grid text size
        strip.text.x = element_text(size = 18), # changing phenotype text size 
        strip.placement = "outside") + # for placing the strip outside of the axis names
  scale_x_discrete(position = "top")

Code
ggsave(filename = path(path1, "output/Baseline_BubblePlot_Nightingale.svg"), height = 17, width = 10, units = "in")

results_lm |> 
  filter(Timepoint == "Month_2",
         term %in% c("Month_6_Culture", "Treatment_Failure")) |> 
  mutate(Biomarker = factor(Biomarker, levels = rev(unique(results_lm$Biomarker)))) |> 
  ggplot(aes(x = term, 
             y = Biomarker, 
             size = abs_estimate, 
             color = FDR)) + 
  geom_point(shape = 15) + 
  scale_size_continuous(range = c(1, 7.5), breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)) +
  scale_color_manual(values = palette_heatmap, na.value = "grey90") +
  labs(size = "Effect size\n(absolute)") + 
  facet_grid(~ Phenotype, scale = "free") +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 0, size = 15), # changing metadata text size
        axis.text.y = element_text(size = 15),  # changing y axis text size (proteins)
        legend.title = element_text(size = 15),
        legend.text = element_text(size = 15), 
        legend.key.size = unit(0.3, "line"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(), 
        panel.background = element_rect(fill = "gainsboro"),
        panel.grid = element_blank(),      
        strip.text.y = element_text(size = 10), # changing facet grid text size
        strip.text.x = element_text(size = 18), # changing phenotype text size 
        strip.placement = "outside") + # for placing the strip outside of the axis names
  scale_x_discrete(position = "top")

Code
ggsave(filename = path(path1, "output/Month2_BubblePlot_Nightingale.png"), height = 17, width = 10, units = "in")




# Bubbleplot for Julia's selection. 
scaled_nightingale <- nightingale_qc_final |> 
  filter(Biomarker %in% Julias_selection) |>  # proteins specified here!
  select(SampleID, Phenotype, Timepoint, Biomarker, Concentration, Age, Sex, BMI, HbA1c, Haemoglobin, Timika_score, Month_6_Culture, Treatment_Failure) |> 
  mutate(Sex = case_when(Sex == "Male" ~ 1,
                         Sex == "Female" ~ 0),
         Month_6_Culture = case_when(Month_6_Culture == "Positive" ~ 1,
                                     Month_6_Culture == "Negative" ~ 0,
                                     TRUE ~ NA),
         Treatment_Failure = case_when(Treatment_Failure == "Yes" ~ 1,
                                       Treatment_Failure == "No" ~ 0,
                                       TRUE ~ NA)) |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  mutate(Concentration = scale(Concentration),
         BMI = scale(BMI),
         HbA1c = scale(HbA1c),
         Age = scale(Age),
         Sex = scale(Sex),
         Haemoglobin = scale(Haemoglobin),
         Timika_score = scale(Timika_score),
         Month_6_Culture = scale(Month_6_Culture),
         Treatment_Failure = scale(Treatment_Failure))

results_lm1 <- scaled_nightingale |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Age) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm2 <- scaled_nightingale |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ BMI) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm3 <- scaled_nightingale |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Sex) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm4 <- scaled_nightingale |> 
  filter(Phenotype != "DM") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Timika_score) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm5 <- scaled_nightingale |> 
  filter(Phenotype != "DM") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Haemoglobin) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm6 <- scaled_nightingale |> 
  filter(Phenotype != "TB") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ HbA1c) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm7 <- scaled_nightingale |> 
  filter(Phenotype != "DM",
         Timepoint == "Month_2") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Month_6_Culture) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm8 <- scaled_nightingale |> 
  filter(Phenotype != "DM",
         Timepoint == "Month_2") |> 
  group_by(Phenotype, Timepoint, Biomarker) |> 
  summarize(out = list(lm(Concentration ~ Treatment_Failure) |> 
                         broom::tidy())) |> 
  unnest(c(out))

results_lm <- rbind(results_lm1, results_lm2, results_lm3, results_lm4, results_lm5, results_lm6, results_lm7, results_lm8) |> 
  filter(term != "(Intercept)") |> 
  select(-std.error, -statistic) |> 
  group_by(Phenotype, Timepoint, term) |> 
  mutate(abs_estimate = abs(estimate),
         direction = estimate / abs_estimate,
         p.adj = p.adjust(p.value, method = "fdr"),
         log10_p_value_and_direction = -log10(p.adj) * direction,
         FDR = cut(log10_p_value_and_direction, 
                   breaks =c(-35, -6, -4, -2.301, -2, -1.301, 1, 1.301, 2, 2.301, 4, 6, 35),
                   labels =c("<0.000001 (neg)", "<0.0001 (neg)", "<0.005 (neg)", "<0.01 (neg)", "<0.05 (neg)", "0", "0",
                             "<0.05 (pos)", "<0.01 (pos)", "<0.005 (pos)", "<0.0001 (pos)", "<0.000001 (pos)")))

# Creating bubble plot.
palette_heatmap <- c("<0.000001 (pos)"="#800026",
                     "<0.0001 (pos)"="#BD0026",
                     "<0.005 (pos)"="#E31A1C",
                     "<0.01 (pos)"="#FC4E2A",
                     "<0.05 (pos)"="#FD8D3C",
                     "non-signif (pos)"="khaki1",
                     #"non-signif"="white", 
                     "non-signif (neg)"="lightcyan",
                     "<0.05 (neg)"="#6BAED6",
                     "<0.01 (neg)"="#4292C6",
                     "<0.005 (neg)"="#2171B5",
                     "<0.0001 (neg)"="#08519C",
                     "<0.000001 (neg)"="#08306B")

results_lm |> 
  filter(Timepoint == "Baseline") |> 
  mutate(Biomarker = factor(Biomarker, levels = rev(unique(results_lm$Biomarker))),
         term = case_when(term == "Timika_score" ~ "Timika score",
                          term == "Sex" ~ "Sex (male)",
                          TRUE ~ term)) |> 
  ggplot(aes(x = term, 
             y = Biomarker, 
             size = abs_estimate, 
             color = FDR)) + 
  geom_point(shape = 15) + 
  scale_size_continuous(range = c(1, 7.5), breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)) +
  scale_color_manual(values = palette_heatmap, na.value = "grey90") +
  labs(size = "Effect size\n(absolute)") + 
  facet_grid(~ Phenotype, scale = "free") +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 0, size = 20), # changing metadata text size
        axis.text.y = element_text(size = 15),  # changing y axis text size (proteins)
        legend.title = element_text(size = 15),
        legend.text = element_text(size = 15), 
        legend.key.size = unit(0.3, "line"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(), 
        panel.background = element_rect(fill = "gainsboro"),
        panel.grid = element_blank(),      
        strip.text.y = element_text(size = 10), # changing facet grid text size
        strip.text.x = element_text(size = 25), # changing phenotype text size 
        strip.placement = "outside") + # for placing the strip outside of the axis names
  scale_x_discrete(position = "top")

Code
ggsave(filename = path(path1, "output/Baseline_BubblePlot_Nightingale_Julias_selection.svg"), height = 15, width = 10, units = "in")

results_lm |> 
  filter(Timepoint == "Month_2",
         term %in% c("Month_6_Culture", "Treatment_Failure")) |> 
  mutate(Biomarker = factor(Biomarker, levels = rev(unique(results_lm$Biomarker)))) |> 
  ggplot(aes(x = term, 
             y = Biomarker, 
             size = abs_estimate, 
             color = FDR)) + 
  geom_point(shape = 15) + 
  scale_size_continuous(range = c(1, 7.5), breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)) +
  scale_color_manual(values = palette_heatmap, na.value = "grey90") +
  labs(size = "Effect size\n(absolute)") + 
  facet_grid(~ Phenotype, scale = "free") +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 0, size = 20), # changing metadata text size
        axis.text.y = element_text(size = 15),  # changing y axis text size (proteins)
        legend.title = element_text(size = 15),
        legend.text = element_text(size = 15), 
        legend.key.size = unit(0.3, "line"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(), 
        panel.background = element_rect(fill = "gainsboro"),
        panel.grid = element_blank(),      
        strip.text.y = element_text(size = 10), # changing facet grid text size
        strip.text.x = element_text(size = 25), # changing phenotype text size 
        strip.placement = "outside") + # for placing the strip outside of the axis names
  scale_x_discrete(position = "top")

Code
ggsave(filename = path(path1, "output/Month2_BubblePlot_Nightingale_Julias_selection.svg"), height = 15, width = 5, units = "in")

Boxplot binary variables

Code
# ApoA1 vs Sex in TB. 

# Filter and reshape the data. 
filtered_data <- nightingale_qc_final |>
  filter(Timepoint == "Baseline",
         Biomarker %in% Julias_selection,
         Phenotype == "TB") |>
  select(SampleID, Phenotype, Biomarker, Concentration, Age, Sex) |>
  pivot_wider(names_from = Biomarker, values_from = Concentration)

# Ensure no NA values. 
filtered_data <- filtered_data |>
  filter(!is.na(Sex), !is.na(`ApoA1`))

# Create a boxplot with regression line.
ggplot(filtered_data, aes(x = factor(Sex), y = `ApoA1`)) + 
  geom_boxplot(fill = "orange", color = "gray10") + 
  geom_jitter(width = 0.2, alpha = 0.6, color = "gray10") + 
  geom_smooth(method = "lm", aes(group = 1), color = "blue", size = 1.5, linetype = "dashed", se = FALSE) +  # Add regression line
  labs(x = "Sex", y = "ApoA1 (g/L)", title = "ApoA1 \nTB at baseline") + 
  scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) +  # Convert 0/1 to Female/Male labels
  theme_minimal(base_size = 15) + 
  theme(axis.text.x = element_text(size = 14),  
        axis.text.y = element_text(size = 14),  
        axis.title.x = element_text(size = 16), 
        axis.title.y = element_text(size = 16), 
        plot.title = element_text(size = 18, hjust = 0.5))

3D Volcano Plot

Code
# # Testing with real data.
# test_metadata <- nightingale_qc_final |> 
#   filter(Timepoint == "Baseline") |> 
#   select(SampleID, Phenotype) |> 
#   mutate(Phenotype = factor(Phenotype, levels = c("TB", "DM", "TB-DM"))) |> 
#   unique()
# 
# test_matrix <- nightingale_qc_final |> 
#   filter(SampleID %in% test_metadata$SampleID,
#          Biomarker %in% Julias_selection) |> 
#   select(SampleID, Biomarker, Concentration) |> 
#   pivot_wider(names_from = Biomarker, values_from = Concentration) |> 
#   column_to_rownames("SampleID") |> 
#   as.matrix()
# 
# test_polar <- polar_coords(outcome = test_metadata$Phenotype,
#                            data = test_matrix)
# 
# # 3D volcano plot
# volcano3D(test_polar)
# 
# # 2D volcano plot 
# tmp <- radial_ggplot(polar = test_polar) 
# 
# tmp$data <- tmp$data |>
#   rownames_to_column("Gene") |>
#   mutate(Gene = case_when(lab != "ns" ~ Gene),
#          lab = case_when(lab == "D+" ~ "DM",
#                          lab == "D+TB-+" ~ "DM + TB-DM",
#                          lab == "TB-+" ~ "TB-DM",
#                          lab == "TB+TB-+" ~ "TB + TB-DM",
#                          TRUE ~ lab),
#          lab = factor(lab, levels = c("ns", "DM", "DM + TB-DM", "TB-DM", "TB + TB-DM")))
# #plus run the next line immediately after!
# tmp + 
#   ggrepel::geom_text_repel(aes(label = Gene), size = 3) +
#   theme(legend.text = element_text(size = 10),
#         legend.position = "bottom")
# 
# ggsave(filename = path(path1, "output/2DVolcano_Nightingale.pdf"), height = 4, width = 4, units = "in")

Kruskal wallis

Kruskal Wallis non-parametric test is used instead of the default one-way ANOVA of the polar_coords function to generate a three-way Volcano plot.

Code
# Make data frame with adj. p-values from previous Wilcox tests for each comparison. 
# the first column is a group test such as one-way ANOVA or Kruskal-Wallis test.
# columns 2-4 contain adj. p-values one for each comparison in the sequence A vs B, A vs C, B vs C. 
# A = TB, B = DM, C = TB-DM 
  
# Perform Kruskal Wallis. 
nightingale_qc_final$Phenotype <- ordered(nightingale_qc_final$Phenotype,
                         levels = c("TB", "DM", "TB-DM"))

kruskal_results <- nightingale_qc_final |> 
  filter(Biomarker %in% Julias_selection,
         Timepoint == "Baseline") |>
  group_by(Biomarker) |>   
  summarise(p_value = kruskal.test(Concentration ~ Phenotype)$p.value)

kruskal_results <- kruskal_results |> 
  mutate(adj_p_value = p.adjust(p_value, method = "BH"))

print(kruskal_results)
# A tibble: 35 × 3
   Biomarker       p_value adj_p_value
   <chr>             <dbl>       <dbl>
 1 ApoA1          4.60e-16    3.22e-15
 2 ApoB           1.30e- 5    1.34e- 5
 3 Clinical LDL-C 6.43e- 7    8.04e- 7
 4 Glucose        3.44e-30    1.21e-28
 5 GlycA          2.28e-24    3.99e-23
 6 HDL-C          1.48e- 7    2.00e- 7
 7 HDL-CE         1.12e- 7    1.57e- 7
 8 HDL-FC         1.15e- 6    1.30e- 6
 9 HDL-L          2.23e-10    6.51e-10
10 HDL-P          6.08e-20    7.09e-19
# ℹ 25 more rows
Code
# Generate data matrix for 3D Volcano. 
# making columns 2-4
tmp <- kruskal_results |> 
  select(Biomarker, Kruskal_result = adj_p_value) |> 
  
  left_join(NG_TB_vs_DM |>
              select(Biomarker, p.adj_TB_vs_DM = p.adj),
            by = "Biomarker") |>
  left_join(NG_TBDM_vs_TB |>
              select(Biomarker, p.adj_TBDM_vs_TB = p.adj),
            by = "Biomarker") |> 
  left_join(NG_TBDM_vs_DM |>
              select(Biomarker, p.adj_TBDM_vs_DM = p.adj),
            by = "Biomarker") |> 
    column_to_rownames(var = "Biomarker") |> 
  as.matrix()

# 3D Volcano plot. 
test_metadata <- nightingale_qc_final |> 
  filter(Biomarker %in% Julias_selection,
         Timepoint == "Baseline") |> 
  select(SampleID, Phenotype) |> 
  mutate(Phenotype = factor(Phenotype, levels = c("TB", "DM", "TB-DM"))) |> 
  unique()

test_matrix <- nightingale_qc_final |> 
  filter(SampleID %in% test_metadata$SampleID,
         Biomarker %in% Julias_selection,
         Timepoint == "Baseline") |>
  select(SampleID, Biomarker, Concentration) |>
  pivot_wider(names_from = Biomarker, values_from = Concentration) |>
  column_to_rownames("SampleID") |>
  as.matrix()

# order of proteins needs to be the same as in the matrix! Arrange proteins: 
tmp1 <- tmp[colnames(test_matrix),]

test_polar <- polar_coords(outcome = test_metadata$Phenotype,
                           data = test_matrix,
                           pvals = tmp1)

volcano3D(test_polar)
Code
# 2D Volcano plot. 
tmp <- radial_ggplot(polar = test_polar) 

tmp$data <- tmp$data |>
  rownames_to_column("Gene") |>
  mutate(Gene = case_when(lab != "ns" ~ Gene),
         lab = case_when(lab == "TB+" ~ "TB",
                         lab == "D+" ~ "DM",
                         lab == "D+TB-+" ~ "DM + TB-DM",
                         lab == "TB-+" ~ "TB-DM",
                         lab == "TB+TB-+" ~ "TB + TB-DM",
                         lab == "TB+D+" ~ "TB + DM",
                         TRUE ~ lab),
         lab = factor(lab, levels = c("ns", "DM", "TB", "TB-DM", "DM + TB-DM", "TB + TB-DM")))
#plus run the next line immediately after!
tmp + 
  ggrepel::geom_text_repel(aes(label = Gene), size = 3) +
  theme(legend.text = element_text(size = 10),
        legend.position = "bottom")

Code
ggsave(filename = path(path1, "output/2DVolcano_Nightingale.svg"), height = 6, width = 5, units = "in")

> Integration